Explore the full implementation in this Quickstart guide:

Python Notebook | TypeScript Example

Prerequisite:

Install the LanceDB SDK for your preferred language.

# Install lancedb and the optional datasets package used in the quickstart example
pip install lancedb datasets

1. Connect to LanceDB Cloud/Enterprise

  • For LanceDB Cloud users, the database URI (which starts with db://) and API key can both be retrieved from the LanceDB Cloud UI. For step-by-step instructions, refer to our onboarding tutorial.
  • For LanceDB Enterprise users, please contact our team to obtain your database URI, API key, and host_override URL.
import lancedb
import numpy as np
import pyarrow as pa
import os

# Connect to LanceDB Cloud/Enterprise.
# Replace the placeholder URI and region with the values for your database.
uri = "db://your-database-uri"
region = "us-east-1"

# Prefer supplying the API key through the LANCEDB_API_KEY environment
# variable so the secret is not stored in the source file; the placeholder
# below is used only when that variable is unset.
api_key = os.environ.get("LANCEDB_API_KEY", "your-api-key")

# (Optional) For LanceDB Enterprise, set the host override to your enterprise endpoint.
# os.environ.get returns None when LANCEDB_HOST_OVERRIDE is unset.
host_override = os.environ.get("LANCEDB_HOST_OVERRIDE")

db = lancedb.connect(
    uri=uri,
    api_key=api_key,
    region=region,
    host_override=host_override,
)

2. Load Dataset

from datasets import load_dataset

# Pull the first 1,000 rows of the test split from a HuggingFace dataset
# that ships with pre-computed embeddings
sample_dataset = load_dataset(
    "sunhaozhepy/ag_news_sbert_keywords_embeddings",
    split="test[:1000]",
)
print(f"Loaded {len(sample_dataset)} samples")
print(f"Sample features: {sample_dataset.features}")
print(f"Column names: {sample_dataset.column_names}")

# Show the first record, then derive the embedding length from it
first_sample = sample_dataset[0]
print(first_sample)

vector_dim = len(first_sample["keywords_embeddings"])
print(f"Embedding dimension: {vector_dim}")

3. Create a table and ingest data

import pyarrow as pa

# Create (or overwrite) the table from the loaded dataset
table_name = "lancedb-cloud-quickstart"
table = db.create_table(
    table_name,
    data=sample_dataset,
    mode="overwrite",
)

# Re-type the vector column from a plain list to a fixed-size list of
# float32 values with vector_dim entries
fixed_size_vector = pa.list_(pa.float32(), vector_dim)
table.alter_columns(
    {"path": "keywords_embeddings", "data_type": fixed_size_vector}
)
print(f"Table '{table_name}' created successfully")

4. Create a vector index

from datetime import timedelta

# Create a "cosine" vector index on the embedding column and wait up to
# 120 seconds for the build to complete
index_timeout = timedelta(seconds=120)
table.create_index(
    "cosine",
    vector_column_name="keywords_embeddings",
    wait_timeout=index_timeout,
)

index_stats = table.index_stats("keywords_embeddings_idx")
print(index_stats)

The create_index/createIndex operation executes asynchronously in LanceDB Cloud/Enterprise. To ensure the index is fully built, you can use the wait_timeout parameter or call wait_for_index on the table.

5. Perform a vector search

# Fetch a single row (index 5000 of the test split) to use as the query
query_dataset = load_dataset(
    "sunhaozhepy/ag_news_sbert_keywords_embeddings",
    split="test[5000:5001]",
)
print(f"Query keywords: {query_dataset[0]['keywords']}")
query_embed = query_dataset["keywords_embeddings"][0]

# Columns returned by both searches; a fresh list is built for each query
output_columns = ("text", "keywords", "label")

# A vector search
top_matches = table.search(query_embed).select(list(output_columns)).limit(5)
result = top_matches.to_pandas()
print("Search results:")
print(result)

# A vector search with a filter
top_filtered_matches = (
    table.search(query_embed)
    .where("label > 2")
    .select(list(output_columns))
    .limit(5)
)
filtered_result = top_filtered_matches.to_pandas()
print("Filtered search results (label > 2):")
print(filtered_result)

6. Drop the table

# Remove the quickstart table by name once you are done with it
db.drop_table(table_name)

Next Steps