vastdb.table

VAST Database table.

class vastdb.table.BlobExpansion(source_column_name: str, target_table_name: str, columns: list[tuple[str, pyarrow.DataType, dict]], config: BlobExpansionConfig, _table_metadata: TableMetadata, _tx: Transaction)[source]

Bases: object

VAST blob expansion.

add_columns(columns_to_add: pyarrow.Schema | None = None, add_copy_source_column: bool = False, add_missing_values_output: bool = False, add_excessive_values_output: bool = False) → None[source]: Add columns to this blob expansion.

columns: list[tuple[str, pyarrow.DataType, dict]]

config: BlobExpansionConfig

drop() → None[source]: Drop this blob expansion.

drop_columns(columns_to_remove: pyarrow.Schema | None = None, remove_copy_source_column: bool = False, remove_missing_values_output: bool = False, remove_excessive_values_output: bool = False) → None[source]: Remove columns from this blob expansion.

source_column_name: str

property table_metadata: TableMetadata: Return the metadata of the target table.

target_table_name: str

property tx: Transaction: Return the transaction.

class vastdb.table.BlobExpansionConfig(expansion_format: ExpansionFormat = ExpansionFormat.JSON, copy_source_column: bool = False, flatten_path: bool = False, flatten_delimiter: str = '__', add_missing_values_output: bool = True, add_excessive_values_output: bool = True)[source]

Bases: object

Configuration for blob expansion.

add_excessive_values_output: bool = True

add_missing_values_output: bool = True

copy_source_column: bool = False

expansion_format: ExpansionFormat = 'json'

flatten_delimiter: str = '__'

flatten_path: bool = False

class vastdb.table.ExpansionFormat(value, names=<not given>, *values, module=None, qualname=None, type=None, start=1, boundary=None)[source]

Bases: Enum

Format for blob expansion data parsing.

JSON = 'json': Parse blob data as JSON.

class vastdb.table.ITable[source]

Bases: ABC

Interface for VAST Table operations.

abstract property arrow_schema: pyarrow.Schema: Table arrow schema.

abstract delete(rows: pyarrow.RecordBatch | pyarrow.Table) → None[source]: Delete rows from table.

abstract import_files(files_to_import: Iterable[str], config: ImportConfig | None = None) → None[source]: Import files into table.

abstract import_partitioned_files(files_and_partitions: dict[str, pyarrow.RecordBatch], config: ImportConfig | None = None) → None[source]: Import partitioned files.

abstract imports_table() → ITable | None[source]: Get imports table.

abstract insert(rows: pyarrow.RecordBatch | pyarrow.Table) → pyarrow.ChunkedArray[source]: Insert rows into table.

abstract property name: str: Table name.

abstract property path: str: Return table’s path.

abstract projection(name: str) → Projection[source]: Get a specific semi-sorted projection of this table.

abstract projections(projection_name: str = '') → Iterable[Projection][source]: List semi-sorted projections.

abstract property ref: TableRef: Return Table Ref.

abstract reload_schema() → None[source]: Reload Arrow Schema.

abstract reload_sorted_columns() → None[source]: Reload Sorted Columns.

abstract reload_stats() → None[source]: Reload Table Stats.

abstract retrieve_column_names() → Sequence[str][source]: Fetch column names.

abstract select(columns: list[str] | None = None, predicate: ibis.expr.types.BooleanColumn | ibis.common.deferred.Deferred | None = None, config: QueryConfig | None = None, *, internal_row_id: bool = False, limit_rows: int | None = None) → pyarrow.RecordBatchReader[source]: Execute a query.

abstract sorted_columns() → list[str][source]: Return sorted columns’ names.

abstract sorting_done() → bool[source]: Check if sorting is done.

abstract sorting_score() → int[source]: Get sorting score.

abstract update(rows: pyarrow.RecordBatch | pyarrow.Table, columns: list[str] | None = None) → None[source]: Update rows in table.

abstract property vector_index: VectorIndex | None: Table’s Vector Index, if it exists.

abstract vector_search(vec: list[float], columns: list[str], limit: int, predicate: ibis.expr.types.BooleanColumn | ibis.common.deferred.Deferred | None = None) → pyarrow.RecordBatchReader[source]: Top-n on vector-column.

class vastdb.table.Projection(name: str, table_metadata: TableMetadata, stats: TableStats, handle: int, tx: Transaction)[source]

Bases: object

VAST semi-sorted projection.

columns() → pyarrow.Schema[source]: Return this projection’s columns as an Arrow schema.

drop() → None[source]: Drop this projection.

handle: int

name: str

rename(new_name: str) → None[source]: Rename this projection.

stats: TableStats

table_metadata: TableMetadata

tx: Transaction

class vastdb.table.Table(metadata: TableMetadata, handle: int, tx: Transaction)[source]

Bases: TableInTransaction

VAST Interactive Table.

add_column(new_column: pyarrow.Schema) → None[source]: Add a new column.

add_sorting_key(sorting_key: list[int] | list[str]) → None[source]: Add a sorting key to a table that doesn’t have any.

columns() → pyarrow.Schema[source]: Return columns’ metadata.

create_blob_expansion(expansion_schema: pyarrow.Schema, target_table_name: str, source_column_name: str = 'value', config: BlobExpansionConfig | None = None, target_table_schema: str | None = None) → BlobExpansion[source]: Create a blob expansion for the given source column.

create_imports_table(fail_if_exists=True) → ITable[source]: Create imports table.

create_projection(projection_name: str, sorted_columns: list[str], unsorted_columns: list[str]) → Projection[source]: Create a new semi-sorted projection.

drop() → None[source]: Drop this table.

drop_column(column_to_drop: pyarrow.Schema) → None[source]: Drop an existing column.

get_stats() → TableStats[source]: Get the statistics of this table.

property handle: int: Table Handle.

imports_table() → Table | None[source]: Get the imports table of this table.

rename(new_name: str) → None[source]: Rename this table.

rename_column(current_column_name: str, new_column_name: str) → None[source]: Rename an existing column.

sorted_columns() → list[pyarrow.Field][source]: Return sorted columns’ metadata.

property sorted_table: bool: Is table a sorted table.

property stats: TableStats: Fetch table’s statistics from server.

property tx: Return transaction.

class vastdb.table.TableInTransaction(metadata: TableMetadata, tx: Transaction)[source]

Bases: ITable

VAST Table.

property arrow_schema: pyarrow.Schema: Table arrow schema.

blob_expansion(source_column_name: str = 'value') → BlobExpansion[source]: Get a blob expansion by source column name.

delete(rows: pyarrow.RecordBatch | pyarrow.Table) → None[source]

Delete a subset of rows in this table.

Row IDs are specified using a special field (named “$row_id” of uint64 type).

import_files(files_to_import: Iterable[str], config: ImportConfig | None = None) → None[source]

Import a list of Parquet files into this table.

The files must be on a VAST S3 server and be accessible using the current credentials.

import_partitioned_files(files_and_partitions: dict[str, pyarrow.RecordBatch], config: ImportConfig | None = None) → None[source]

Import a list of Parquet files into this table.

The files must be on a VAST S3 server and be accessible using the current credentials. Each file must have its own partition values defined as an Arrow RecordBatch.

imports_table() → ITable | None[source]: Get the imports table of this table.

imports_table_metadata() → TableMetadata[source]: Get TableMetadata for import table.

insert(rows: pyarrow.RecordBatch | pyarrow.Table, by_columns: bool = True) → pyarrow.ChunkedArray[source]: Insert a RecordBatch into this table.

insert_in_column_batches(rows: pyarrow.RecordBatch) → pyarrow.ChunkedArray[source]

Split the RecordBatch into an insert + updates.

This both supports rows that won’t fit into an RPC and improves performance for wide rows. Insert the first MAX_COLUMN_IN_BATCH columns and get the row_ids. Then loop over the rest of the columns and update them in groups of MAX_COLUMN_IN_BATCH.

property name: str: Table name.

property path: str: Return table’s path.

projection(name: str, include_stats: bool = True) → Projection[source]: Get a specific semi-sorted projection of this table.

projections(projection_name: str = '', include_stats: bool = True) → Iterable[Projection][source]

List all semi-sorted projections of this table if projection_name is empty.

Otherwise, list only the specific projection (if it exists).

property ref: TableRef: Table Reference.

reload_schema() → None[source]: Reload Arrow Schema.

reload_sorted_columns() → None[source]: Reload Sorted Columns.

reload_stats() → None[source]: Reload Table Stats.

retrieve_column_names() → Sequence[str][source]: Fetch column names.

select(columns: list[str] | None = None, predicate: ibis.expr.types.BooleanColumn | ibis.common.deferred.Deferred = None, config: QueryConfig | None = None, *, internal_row_id: bool = False, limit_rows: int | None = None) → pyarrow.RecordBatchReader[source]

Execute a query over this table.

To read a subset of the columns, specify their names via columns argument. Otherwise, all columns will be read.

In order to apply a filter, a predicate can be specified. See https://github.com/vast-data/vastdb_sdk/blob/main/README.md#filters-and-projections for more details.

Query-execution configuration options can be specified via the optional config argument.

select_splits(columns: list[str] | None = None, predicate: ibis.expr.types.BooleanColumn | ibis.common.deferred.Deferred = None, config: QueryConfig | None = None, *, internal_row_id: bool = False, limit_rows: int | None = None) → list[pyarrow.RecordBatchReader][source]: Return pa.RecordBatchReader for each split.

sorted_columns() → list[pyarrow.Field][source]: Return sorted columns’ metadata.

sorting_done() → bool[source]: Sorting done indicator for the table. Always False for unsorted tables.

sorting_score() → int[source]: Sorting score for the table. Always 0 for unsorted tables.

property stats: TableStats | None: Table’s statistics.

update(rows: pyarrow.RecordBatch | pyarrow.Table, columns: list[str] | None = None) → None[source]

Update a subset of cells in this table.

Row IDs are specified using a special field (named “$row_id” of uint64 type) - this function assumes that this special field is part of the arguments.

A subset of columns to be updated can be specified via the columns argument.

property vector_index: VectorIndex | None: Table’s Vector Index, if it exists.

vector_search(vec: list[float], columns: list[str], limit: int, predicate: ibis.expr.types.BooleanColumn | ibis.common.deferred.Deferred | None = None) → pyarrow.RecordBatchReader[source]: Vector Search over vector indexed columns.