Version Control & Reproducibility in LanceDB

LanceDB redefines data management for AI/ML workflows with built-in, automatic versioning powered by the Lance columnar format. Every table mutation—appends, updates, deletions, or schema changes—is tracked with zero configuration, enabling:

  • Time-Travel Debugging: Pinpoint production issues by querying historical table states.
  • Atomic Rollbacks: Revert terabyte-scale datasets to any prior version in seconds.
  • ML Reproducibility: Exactly reproduce training snapshots (vectors + metadata).
  • Branching Workflows: Conduct A/B tests on embeddings/models via lightweight table clones.
import lancedb
import pandas as pd
import numpy as np
import pyarrow as pa
from sentence_transformers import SentenceTransformer

# Open the LanceDB connection; swap the placeholder project slug, API key,
# and region for your own values.
db = lancedb.connect(
    uri="db://your-project-slug",
    api_key="your-api-key",
    region="us-east-1",
)

# Arrow schema for the quotes table: an integer id plus two string columns
schema = pa.schema(
    [
        ("id", pa.int64()),
        ("author", pa.string()),
        ("quote", pa.string()),
    ]
)

# Seed rows used to create the table
table_name = "quotes_versioning_example"
data = [
    {"id": 1, "author": "Richard", "quote": "Wubba Lubba Dub Dub!"},
    {"id": 2, "author": "Morty", "quote": "Rick, what's going on?"},
    {"id": 3, "author": "Richard", "quote": "I turned myself into a pickle, Morty!"},
]

# mode="overwrite" replaces any table that already exists under this name,
# so the walkthrough can be re-run from a clean state
table = db.create_table(table_name, data, schema=schema, mode="overwrite")

# Inspect the version history right after creation
versions = table.list_versions()
print(f"Number of versions after creation: {len(versions)}")
print(f"Current version: {table.version}")

Modifying Data

When you modify data through operations such as update, add, or delete, LanceDB automatically creates a new version for each operation.

# Update author names to be more specific. The seed rows store the author as
# "Richard", so the predicate has to match that exact value; otherwise the
# update touches no rows. This update creates a new table version.
table.update(where="author='Richard'", values={"author": "Richard Daniel Sanchez"})
rows_after_update = table.count_rows()
print(f"Number of rows after update: {rows_after_update}")

# Add more data (the append creates another version)
more_data = [
    {
        "id": 4,
        "author": "Richard Daniel Sanchez",
        "quote": "That's the way the news goes!",
    },
    {"id": 5, "author": "Morty", "quote": "Aww geez, Rick!"},
]
table.add(more_data)

# Check versions after modifications. version_after_mod is remembered so the
# table can later be restored to this exact state.
versions = table.list_versions()
version_count_after_mod = len(versions)
version_after_mod = table.version
print(f"Number of versions after modifications: {version_count_after_mod}")
print(f"Current version: {version_after_mod}")

Schema Evolution

LanceDB’s versioning system automatically tracks every schema modification. This is critical when handling evolving embedding models. For example, adding a new vector_minilm column creates a fresh version, enabling seamless A/B testing between embedding generations without recreating the table.

import pyarrow as pa

# Get data from table
df = table.search().limit(5).to_pandas()

# Let's use "all-MiniLM-L6-v2" model to embed the quotes
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Generate one normalized embedding per quote, in the same row order as df
vectors = model.encode(
    df["quote"].tolist(), convert_to_numpy=True, normalize_embeddings=True
)
vector_dim = vectors[0].shape[0]
print(f"Vector dimension: {vector_dim}")

# Pair each embedding with the id of the row it was computed from. Taking the
# ids from the DataFrame (instead of assuming they are 1..N in scan order)
# keeps every vector attached to the right quote.
vectors_with_ids = [
    {"id": row_id, "vector_minilm": vec.tolist()}
    for row_id, vec in zip(df["id"].tolist(), vectors)
]

# Add an all-NULL fixed-size float32 list column sized to the embedding
# dimension, then fill it by merging on "id"
table.add_columns(
    {"vector_minilm": f"arrow_cast(NULL, 'FixedSizeList({vector_dim}, Float32)')"}
)

table.merge_insert(
    "id"
).when_matched_update_all().when_not_matched_insert_all().execute(vectors_with_ids)

# Check versions after schema change
versions = table.list_versions()
version_count_after_embed = len(versions)
version_after_embed = table.version
print(f"Number of versions after adding embeddings: {version_count_after_embed}")
print(f"Current version: {version_after_embed}")

# Verify the schema change
# The table should now include a vector_minilm column containing
# embeddings generated by the all-MiniLM-L6-v2 model
print(table.schema)

Rollback to Previous Versions

LanceDB supports fast rollbacks to any previous version without data duplication.

# Walk the full version history and show when each version was created
versions = table.list_versions()
for entry in versions:
    print(f"Version {entry['version']}, created at {entry['timestamp']}")

# Roll back to the snapshot recorded after the data modifications, i.e. the
# state of the table before the vector column was added
table.restore(version_after_mod)

# Restoring appends a new version to the history rather than removing any,
# so the total count goes up by one
versions = table.list_versions()
version_count_after_rollback = len(versions)
print(f"Total number of versions after rollback: {version_count_after_rollback}")

Making Changes from Previous Versions

After restoring a table to an earlier version, you can continue making modifications. In this example, we rolled back to a version before adding embeddings. This allows us to experiment with different embedding models and compare their performance. Here’s how to switch to a different model and add new embeddings:

# Let's switch to the all-mpnet-base-v2 model to embed the quotes
model = SentenceTransformer("all-mpnet-base-v2", device="cpu")

# Generate one normalized embedding per quote, in the same row order as df
# (df is the DataFrame read from the table earlier in this walkthrough)
vectors = model.encode(
    df["quote"].tolist(), convert_to_numpy=True, normalize_embeddings=True
)
vector_dim = vectors[0].shape[0]
print(f"Vector dimension: {vector_dim}")

# Pair each embedding with the id of the row it was computed from. Taking the
# ids from the DataFrame (instead of assuming they are 1..N in scan order)
# keeps every vector attached to the right quote.
vectors_with_ids = [
    {"id": row_id, "vector_mpnet": vec.tolist()}
    for row_id, vec in zip(df["id"].tolist(), vectors)
]

# Add an all-NULL fixed-size float32 list column sized to the embedding
# dimension, then fill it by merging on "id"
table.add_columns(
    {"vector_mpnet": f"arrow_cast(NULL, 'FixedSizeList({vector_dim}, Float32)')"}
)

table.merge_insert(
    "id"
).when_matched_update_all().when_not_matched_insert_all().execute(vectors_with_ids)

# Check versions after schema change
versions = table.list_versions()
version_count_after_alter_embed = len(versions)
version_after_alter_embed = table.version
print(
    f"Number of versions after switching model: {version_count_after_alter_embed}"
)
print(f"Current version: {version_after_alter_embed}")

# The table should now include a vector_mpnet column containing
# embeddings generated by the all-mpnet-base-v2 model
print(table.schema)

Deleting Data from the Table

# Point the table handle at the newest version before mutating it
table.checkout_latest()

# Remove every row whose author is not 'Richard Daniel Sanchez'
not_rick = "author != 'Richard Daniel Sanchez'"
table.delete(not_rick)
rows_after_deletion = table.count_rows()
print(f"Number of rows after deletion: {rows_after_deletion}")

Summary of the versions created in this walkthrough:

  1. Create and add data
  2. Update
  3. Append (add more rows)
  4. Schema evolution (add a vector column)
  5. Merge (add embeddings to the vector column)
  6. Restore (version 3)
  7. Schema evolution (add a new vector column)
  8. Merge (add embeddings to the new vector column)
  9. Deletion