LanceDB redefines data management for AI/ML workflows with built-in,
automatic versioning powered by the Lance columnar format.
Every table mutation (appends, updates, deletions, or schema changes) is tracked with
zero configuration, enabling:
Time-Travel Debugging: Pinpoint production issues by querying historical table states.
Atomic Rollbacks: Revert terabyte-scale datasets to any prior version in seconds.
ML Reproducibility: Exactly reproduce training snapshots (vectors + metadata).
Branching Workflows: Conduct A/B tests on embeddings/models via lightweight table clones.
import lancedb
import pandas as pd
import numpy as np
import pyarrow as pa
from sentence_transformers import SentenceTransformer

# Connect to LanceDB
db = lancedb.connect(
    uri="db://your-project-slug",
    api_key="your-api-key",
    region="us-east-1",
)

# Create a table with initial data
table_name = "quotes_versioning_example"
data = [
    {"id": 1, "author": "Richard", "quote": "Wubba Lubba Dub Dub!"},
    {"id": 2, "author": "Morty", "quote": "Rick, what's going on?"},
    {"id": 3, "author": "Richard", "quote": "I turned myself into a pickle, Morty!"},
]

# Define schema
schema = pa.schema([
    pa.field("id", pa.int64()),
    pa.field("author", pa.string()),
    pa.field("quote", pa.string()),
])

table = db.create_table(table_name, data, schema=schema, mode="overwrite")

# View the initial version
versions = table.list_versions()
print(f"Number of versions after creation: {len(versions)}")
print(f"Current version: {table.version}")
When you modify data through operations like update or delete, LanceDB automatically creates new versions.
# Make changes to the table
table.update(where="author='Richard'", values={"author": "Richard Daniel Sanchez"})
rows_after_update = table.count_rows()
print(f"Number of rows after update: {rows_after_update}")

# Add more data
more_data = [
    {"id": 4, "author": "Richard Daniel Sanchez", "quote": "That's the way the news goes!"},
    {"id": 5, "author": "Morty", "quote": "Aww geez, Rick!"},
]
table.add(more_data)

# Check versions after modifications
versions = table.list_versions()
version_count_after_mod = len(versions)
version_after_mod = table.version
print(f"Number of versions after modifications: {version_count_after_mod}")
print(f"Current version: {version_after_mod}")
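You can also inspect an earlier state without changing anything, which is what the time-travel debugging workflow from the overview looks like in practice. The sketch below is a minimal illustration: it assumes table.checkout(version) is available alongside the checkout_latest() call used later in this example, and it simply picks the first entry returned by list_versions() as the version to inspect.

# Minimal time-travel sketch: read an older version without modifying history.
# Assumes table.checkout(version) is available (alongside checkout_latest()).
first_version = table.list_versions()[0]["version"]

table.checkout(first_version)  # switch reads to the historical state
print(f"Checked out version: {table.version}")
print(table.search().limit(10).to_pandas())  # query the table as it was back then

table.checkout_latest()  # return to the current head before making further writes
print(f"Back on latest version: {table.version}")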
LanceDB’s versioning system automatically tracks
every schema modification. This is critical when handling
evolving embedding models. For example, adding a new
vector_minilm column creates a fresh version, enabling seamless A/B testing
between embedding generations without recreating the table.
import pyarrow as pa

# Get data from the table
df = table.search().limit(5).to_pandas()

# Let's use the "all-MiniLM-L6-v2" model to embed the quotes
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Generate embeddings for each quote and pair with IDs
vectors = model.encode(
    df["quote"].tolist(), convert_to_numpy=True, normalize_embeddings=True
)
vector_dim = vectors[0].shape[0]
print(f"Vector dimension: {vector_dim}")

# Add IDs to vectors array with proper column names
vectors_with_ids = [
    {"id": i + 1, "vector_minilm": vec.tolist()} for i, vec in enumerate(vectors)
]

# Add vector column and merge data
table.add_columns(
    {"vector_minilm": f"arrow_cast(NULL, 'FixedSizeList({vector_dim}, Float32)')"}
)
table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
    vectors_with_ids
)

# Check versions after schema change
versions = table.list_versions()
version_count_after_embed = len(versions)
version_after_embed = table.version
print(f"Number of versions after adding embeddings: {version_count_after_embed}")
print(f"Current version: {version_after_embed}")

# Verify the schema change
# The table should now include a vector_minilm column containing
# embeddings generated by the all-MiniLM-L6-v2 model
print(table.schema)
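With the embeddings in place, the new column can be queried right away. The snippet below is only a sketch: the query text is arbitrary, and it assumes search() accepts a vector_column_name argument to select which vector column to use (handy once the table holds more than one).

# Sketch: nearest-neighbour query against the new vector_minilm column.
# The query text is arbitrary; vector_column_name is assumed to be supported
# so the right column is used once several vector columns exist.
query_vec = model.encode("pickle", convert_to_numpy=True, normalize_embeddings=True)

results = (
    table.search(query_vec, vector_column_name="vector_minilm")
    .limit(3)
    .to_pandas()
)
print(results[["id", "author", "quote"]])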
LanceDB supports fast rollbacks to any previous version without data duplication.
# Let's see all versions
versions = table.list_versions()
for v in versions:
    print(f"Version {v['version']}, created at {v['timestamp']}")

# Let's roll back to before we added the vector column
# We'll use the version after modifications but before adding embeddings
table.restore(version_after_mod)

# Notice we have one more version now, not less!
versions = table.list_versions()
version_count_after_rollback = len(versions)
print(f"Total number of versions after rollback: {version_count_after_rollback}")
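Before moving on, it helps to confirm what the restore actually did. The quick check below reuses only calls already shown above.

# Sanity check after the rollback: the head now mirrors the restored state,
# so the vector_minilm column should no longer appear in the schema.
print(f"Current version: {table.version}")
print(f"Row count: {table.count_rows()}")
print(table.schema)  # expect id, author, and quote only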
After restoring a table to an earlier version, you can continue making modifications. In this example,
we rolled back to a version before adding embeddings. This allows us to experiment with different
embedding models and compare their performance. Here’s how to switch to a different model and add new embeddings:
# Let's switch to the all-mpnet-base-v2 model to embed the quotes
model = SentenceTransformer("all-mpnet-base-v2", device="cpu")

# Generate embeddings for each quote and pair with IDs
vectors = model.encode(
    df["quote"].tolist(), convert_to_numpy=True, normalize_embeddings=True
)
vector_dim = vectors[0].shape[0]
print(f"Vector dimension: {vector_dim}")

# Add IDs to vectors array with proper column names
vectors_with_ids = [
    {"id": i + 1, "vector_mpnet": vec.tolist()} for i, vec in enumerate(vectors)
]

# Add vector column and merge data
table.add_columns(
    {"vector_mpnet": f"arrow_cast(NULL, 'FixedSizeList({vector_dim}, Float32)')"}
)
table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
    vectors_with_ids
)

# Check versions after schema change
versions = table.list_versions()
version_count_after_alter_embed = len(versions)
version_after_alter_embed = table.version
print(f"Number of versions after switching model: {version_count_after_alter_embed}")
print(f"Current version: {version_after_alter_embed}")

# The table should now include a vector_mpnet column containing
# embeddings generated by the all-mpnet-base-v2 model
print(table.schema)
# Go back to the latest version
table.checkout_latest()

# Let's delete data from the table
table.delete("author != 'Richard Daniel Sanchez'")
rows_after_deletion = table.count_rows()
print(f"Number of rows after deletion: {rows_after_deletion}")
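Because every intermediate version is retained, the two embedding experiments remain available side by side. The sketch below again assumes table.checkout(version) as the read-only counterpart of checkout_latest(), using it to peek at the earlier vector_minilm schema before returning to the latest state.

# Sketch: compare the two embedding experiments across versions.
# Assumes table.checkout(version) is available for read-only inspection.
table.checkout(version_after_embed)  # the version that carries vector_minilm
print("Schema with all-MiniLM-L6-v2 embeddings:")
print(table.schema)

table.checkout_latest()  # back to the head, which carries vector_mpnet
print("Schema with all-mpnet-base-v2 embeddings:")
print(table.schema)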