We support high-throughput writes, comfortably handling 4 GB per second.
Our client SDK maintains 1:1 parity with the open-source version,
so existing users can migrate seamlessly with zero refactoring required.
LanceDB supports table creation using multiple data formats, including:
- Pandas DataFrames (example below)
- Polars DataFrames
- Apache Arrow Tables
For the Python SDK, you can also define tables flexibly using:
- PyArrow schemas (for explicit schema control)
- LanceModel (a Pydantic-based model for structured data validation and serialization)
This ensures compatibility with modern data workflows while maintaining performance and type safety.
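For instance, a table can be created directly from a Pandas or Polars DataFrame. The sketch below is illustrative: the table names are placeholders, and db is a LanceDB connection created as shown in the Insert data section that follows.

import pandas as pd
import polars as pl

# db is a LanceDB connection, created as in the Insert data section below

# create a table from a Pandas DataFrame
df = pd.DataFrame({
    "vector": [[3.1, 4.1], [5.9, 26.5]],
    "item": ["foo", "bar"],
    "price": [10.0, 20.0],
})
table = db.create_table("pandas_example", data=df, mode="overwrite")

# the same call accepts a Polars DataFrame
pl_df = pl.DataFrame({
    "vector": [[3.1, 4.1], [5.9, 26.5]],
    "item": ["foo", "bar"],
    "price": [10.0, 20.0],
})
table = db.create_table("polars_example", data=pl_df, mode="overwrite")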
Insert data
import lancedb
import pyarrow as pa

# connect to LanceDB Cloud
db = lancedb.connect(
    uri="db://your-project-slug",
    api_key="your-api-key",
    region="us-east-1",
)

# rows to insert
data = [
    {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
    {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    {"vector": [10.2, 100.8], "item": "baz", "price": 30.0},
    {"vector": [1.4, 9.5], "item": "fred", "price": 40.0},
]

# explicit schema: a 2-dimensional float32 vector plus two scalar columns
schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])

# create an empty table with the schema, then add the rows
table_name = "basic_ingestion_example"
table = db.create_table(table_name, schema=schema, mode="overwrite")
table.add(data)
The vector column must be a pyarrow.FixedSizeList type; passing a length to pa.list_, as in pa.list_(pa.float32(), 2) above, produces one.
Using Pydantic Models
from lancedb.pydantic import Vector, LanceModel

# define a Pydantic model describing one row
class Content(LanceModel):
    movie_id: int
    vector: Vector(128)
    genres: str
    title: str
    imdb_id: int

    @property
    def imdb_url(self) -> str:
        return f"https://www.imdb.com/title/tt{self.imdb_id}"

# create a table whose schema is derived from the Pydantic model
table_name = "pydantic_example"
table = db.create_table(table_name, schema=Content, mode="overwrite")
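Rows can then be inserted as plain dicts that match the model's fields. This is a minimal sketch with made-up values; the 128-dimensional vector is just a placeholder.

import random

# insert a row as a dict matching the model fields; values are placeholders
table.add([
    {
        "movie_id": 1,
        "vector": [random.random() for _ in range(128)],
        "genres": "drama",
        "title": "A Placeholder Title",
        "imdb_id": 1234567,
    },
])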
Using Nested Models
You can use nested Pydantic models to represent complex data structures.
For example, you may want to store the document string and the document source name as a nested Document object:
from pydantic import BaseModel

class Document(BaseModel):
    content: str
    source: str
This can be used as the type of a LanceDB table column:
class NestedSchema(LanceModel):
    id: str
    vector: Vector(128)
    document: Document

# create a table with the nested schema
table_name = "nested_model_example"
table = db.create_table(table_name, schema=NestedSchema, mode="overwrite")
This creates a struct column called document with two subfields, content and source:
In [28]: table.schema
Out[28]:
id: string not null
vector: fixed_size_list<item: float>[128] not null
    child 0, item: float
document: struct<content: string not null, source: string not null> not null
    child 0, content: string not null
    child 1, source: string not null
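Nested rows can then be inserted as nested dicts, with the inner dict populating the struct column. A minimal sketch with placeholder values:

# insert a row whose "document" field is a nested dict matching the struct
table.add([
    {
        "id": "doc-1",
        "vector": [0.0] * 128,
        "document": {"content": "hello world", "source": "example.txt"},
    },
])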
Insert large datasets
We recommend using an iterator to add large datasets in batches, rather than materializing everything in memory when creating your table in one go. Data is automatically compacted for the best query performance.
import pyarrow as pa

def make_batches():
    # yield 5 batches of 2 rows each
    for i in range(5):
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]],
                         pa.list_(pa.float32(), 2)),
                pa.array([f"item{i*2+1}", f"item{i*2+2}"]),
                pa.array([float((i*2+1)*10), float((i*2+2)*10)]),
            ],
            ["vector", "item", "price"],
        )

schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])

# create the table directly from the batch iterator
table_name = "batch_ingestion_example"
table = db.create_table(table_name, make_batches(), schema=schema, mode="overwrite")
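To sanity-check the ingestion, you can count rows and run a quick vector search. A minimal sketch; to_pandas() assumes pandas is installed.

# verify the ingestion: 5 batches x 2 rows = 10 rows
print(table.count_rows())

# run a quick nearest-neighbor query against the new table
results = table.search([3.0, 4.0]).limit(3).to_pandas()
print(results)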
Explore the full documentation in our SDK guides: Python and TypeScript.