We support high-throughput writes, comfortably handling 4 GB per second. Our client SDK maintains 1:1 parity with the open-source version, so existing users can migrate seamlessly with zero refactoring required.

LanceDB supports table creation using multiple data formats, including:

  • Pandas DataFrames (example below)
  • Polars DataFrames
  • Apache Arrow Tables

For the Python SDK, you can also define tables flexibly using:

  • PyArrow schemas (for explicit schema control)
  • LanceModel (a Pydantic-based model for structured data validation and serialization)

Both approaches are demonstrated below, and both keep your tables compatible with modern data workflows while preserving performance and type safety.

Insert data

import lancedb
import pyarrow as pa

# connect to LanceDB Cloud
db = lancedb.connect(
  uri="db://your-project-slug",
  api_key="your-api-key",
  region="us-east-1"
)

# define the rows to insert
data = [
    {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
    {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    {"vector": [10.2, 100.8], "item": "baz", "price": 30.0},
    {"vector": [1.4, 9.5], "item": "fred", "price": 40.0},
]

# define an explicit schema; the vector column is a fixed-size list
schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])

# create an empty table with the schema, then add the rows
table_name = "basic_ingestion_example"
table = db.create_table(table_name, schema=schema, mode="overwrite")
table.add(data)

The vector column must be a pyarrow.FixedSizeList type, which pa.list_ produces when given an explicit list size.
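
The same rows can also be supplied as a Pandas DataFrame, as noted in the list above. A minimal sketch, reusing the db connection, data, and schema from the previous example (the table name here is illustrative):

import pandas as pd

# build a DataFrame from the same rows
df = pd.DataFrame(data)

# create_table accepts a DataFrame directly; the explicit schema
# casts the vector column to the fixed-size list type
table = db.create_table("pandas_ingestion_example", data=df, schema=schema, mode="overwrite")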

Using Pydantic Models

from lancedb.pydantic import Vector, LanceModel

# Define a Pydantic model
class Content(LanceModel):
    movie_id: int
    vector: Vector(128)
    genres: str
    title: str
    imdb_id: int

    @property
    def imdb_url(self) -> str:
        return f"https://www.imdb.com/title/tt{self.imdb_id}"

# Create table with Pydantic model schema
table_name = "pydantic_example"
table = db.create_table(table_name, schema=Content, mode="overwrite")
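
Rows can then be inserted as model instances, which gives you Pydantic validation at write time. A minimal sketch with illustrative values:

# insert a row as a validated model instance (values are illustrative)
table.add([
    Content(
        movie_id=1,
        vector=[0.0] * 128,
        genres="sci-fi",
        title="Example Movie",
        imdb_id=1234567,
    )
])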

Using Nested Models

You can use nested Pydantic models to represent complex data structures. For example, you may want to store the document string and the document source name as a nested Document object:

from pydantic import BaseModel

class Document(BaseModel):
    content: str
    source: str

This can be used as the type of a LanceDB table column:

class NestedSchema(LanceModel):
    id: str
    vector: Vector(128)
    document: Document

# Create table with nested schema
table_name = "nested_model_example"
table = db.create_table(table_name, schema=NestedSchema, mode="overwrite")

This creates a struct column called document that has two subfields called content and source:

In [28]: table.schema
Out[28]:
id: string not null
vector: fixed_size_list<item: float>[128] not null
    child 0, item: float
document: struct<content: string not null, source: string not null> not null
    child 0, content: string not null
    child 1, source: string not null
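
Inserting into the nested table works the same way: the nested model serializes into the struct column. A minimal sketch with illustrative values:

# the nested Document becomes the struct column's subfields
table.add([
    NestedSchema(
        id="doc-1",
        vector=[0.0] * 128,
        document=Document(content="Hello world", source="greeting.txt"),
    )
])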

Insert large datasets

When ingesting a large dataset in one go, it is recommended to pass an iterator that yields batches, so the data streams in rather than being materialized in memory all at once. Data will be automatically compacted for the best query performance.

import pyarrow as pa

def make_batches():
    for i in range(5):  # yield 5 batches of 2 rows each
        yield pa.RecordBatch.from_arrays(
            [
                pa.array([[3.1, 4.1], [5.9, 26.5]],
                        pa.list_(pa.float32(), 2)),
                pa.array([f"item{i*2+1}", f"item{i*2+2}"]),
                pa.array([float((i*2+1)*10), float((i*2+2)*10)]),
            ],
            ["vector", "item", "price"],
        )

schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])
# Create table with batches
table_name = "batch_ingestion_example"
table = db.create_table(table_name, make_batches(), schema=schema, mode="overwrite")
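
As a quick sanity check, five batches of two rows each should yield ten rows; count_rows in the Python SDK reports the total:

# verify the batch ingest landed all rows
print(table.count_rows())  # expected: 10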

Explore the full documentation in our SDK guides: Python and TypeScript.