Explore the full implementation in this Quickstart guide:

Python Notebook | TypeScript Example

Prerequisite:

Install the LanceDB SDK for your preferred language:

pip install lancedb
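
If you are following the TypeScript example instead, the SDK is published on npm; a hedged equivalent of the command above (assuming the current @lancedb/lancedb package name) is:

npm install @lancedb/lancedb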

1. Connect to LanceDB Cloud/Enterprise

  • For LanceDB Cloud users, the database URI (which starts with db://) and API key can both be retrieved from the LanceDB Cloud UI. For step-by-step instructions, refer to our onboarding tutorial.
  • For LanceDB Enterprise users, please contact our team to obtain your database URI, API key, and host_override URL.
import lancedb
import numpy as np
import pyarrow as pa
import os

# Connect to LanceDB Cloud/Enterprise
uri = "db://your-database-uri"
api_key = "your-api-key"
region = "your-region" # this is "us-east-1" for LanceDB Cloud users

# (Optional) For LanceDB Enterprise, set the host override to your enterprise endpoint
host_override = os.environ.get("LANCEDB_HOST_OVERRIDE")

db = lancedb.connect(
  uri=uri,
  api_key=api_key,
  region=region,
  host_override=host_override # not needed for LanceDB Cloud; Cloud users can leave this as None
)
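
As an optional sanity check, you can confirm the connection works by listing the tables that already exist in the database (an empty list simply means nothing has been created yet):

# Optional: verify the connection by listing the tables in the database
print(db.table_names())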

2. Load Dataset

from datasets import load_dataset

# Load a sample dataset from HuggingFace with pre-computed embeddings
sample_dataset = load_dataset("sunhaozhepy/ag_news_sbert_keywords_embeddings", split="test[:1000]")
print(f"Loaded {len(sample_dataset)} samples")
print(f"Sample features: {sample_dataset.features}")
print(f"Column names: {sample_dataset.column_names}")

# Preview the first sample
print(sample_dataset[0])

# Get embedding dimension
vector_dim = len(sample_dataset[0]["keywords_embeddings"])
print(f"Embedding dimension: {vector_dim}")

3. Create a table and ingest data

import pyarrow as pa

# Create a table with the dataset
table_name = "lancedb-cloud-quickstart"
table = db.create_table(table_name, data=sample_dataset, mode="overwrite")

# Convert the embedding column from a variable-size list to a fixed-size list so it can be used as a vector column
table.alter_columns(dict(path="keywords_embeddings", data_type=pa.list_(pa.float32(), vector_dim)))
print(f"Table '{table_name}' created successfully")
# Create a vector index
table.create_index("cosine", vector_column_name="keywords_embeddings")

# Wait for the index to be ready
import time

def wait_for_index(table, index_name):
    POLL_INTERVAL = 10
    while True:
        indices = table.list_indices()
        if indices and any(index.name == index_name for index in indices):
            break
        print(f"⏳ Waiting for {index_name} to be ready...")
        time.sleep(POLL_INTERVAL)
    print(f"✅ {index_name} is ready!")

index_name = "keywords_embeddings_idx"
wait_for_index(table, index_name)
print(table.index_stats(index_name))

create_index (Python) / createIndex (TypeScript) executes asynchronously. To ensure the index is fully built before you query it, poll the index status as demonstrated above and monitor the build progress.
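
If you prefer to wait on row counts rather than the index listing, a variant of the loop above can poll index_stats instead. This is a sketch that assumes index_stats() reports a num_unindexed_rows field, as in recent versions of the SDK:

def wait_until_fully_indexed(table, index_name, poll_interval=10, timeout=600):
    # Assumption: index_stats() returns an object with a num_unindexed_rows field
    deadline = time.time() + timeout
    while time.time() < deadline:
        stats = table.index_stats(index_name)
        if stats is not None and stats.num_unindexed_rows == 0:
            print(f"✅ {index_name} has indexed all rows")
            return
        print(f"⏳ Still indexing, checking again in {poll_interval}s...")
        time.sleep(poll_interval)
    raise TimeoutError(f"{index_name} was not fully built within {timeout}s")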

5. Perform a vector search

query_dataset = load_dataset("sunhaozhepy/ag_news_sbert_keywords_embeddings", split="test[5000:5001]")
print(f"Query keywords: {query_dataset[0]['keywords']}")
query_embed = query_dataset["keywords_embeddings"][0]

# Run a basic vector search for the 5 most similar records
result = (
    table.search(query_embed)
    .select(["text", "keywords", "label"])
    .limit(5)
    .to_pandas()
)
print("Search results:")
print(result)

# Run the same vector search with a metadata filter
filtered_result = (
    table.search(query_embed)
    .where("label > 2")
    .select(["text", "keywords", "label"])
    .limit(5)
    .to_pandas()
)
print("Filtered search results (label > 2):")
print(filtered_result)
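
Assuming the SDK's usual behavior of appending a _distance column to vector search results, you can also inspect how close each match is (lower is closer for the cosine metric used here):

# Inspect the nearest match and its distance to the query vector
top_hit = result.iloc[0]
print(f"Closest match (distance={top_hit['_distance']:.4f}): {top_hit['text']}")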

6. Drop the table

db.drop_table(table_name)

Next Steps