Migrating from Pandas to Moltres
This guide helps you transition from Pandas to Moltres, highlighting key differences and migration patterns.
Why Migrate?
Memory efficiency: Moltres executes operations in SQL, avoiding data loading into memory
Large datasets: Handle datasets larger than available RAM
Database-native: Work directly with database tables without exporting/importing
SQL pushdown: Leverage database optimizations for better performance
Key Differences
Pandas |
Moltres |
|---|---|
In-memory operations |
SQL pushdown execution |
|
|
|
|
|
|
Eager execution |
Lazy evaluation (call |
Migration Patterns
1. Reading Data
Pandas:
import pandas as pd
# Read from CSV
# df = pd.read_csv("data.csv") # Requires data.csv file
# Read from SQL
# df = pd.read_sql("SELECT * FROM users", connection) # Requires database connection
Moltres:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
db = connect("sqlite:///:memory:")
# Create table first
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Read from CSV (returns Records, not DataFrame)
# Note: Requires data.csv file to exist
# records = Records.from_csv("data.csv", database=db)
# records.insert_into("users")
# Insert sample data instead
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# Read from SQL table (returns DataFrame)
df = db.table("users").select()
results = df.collect() # Returns list of dicts by default
# Or get as pandas DataFrame (requires: pip install pandas)
# results = df.collect(format="pandas") # Returns pd.DataFrame
2. Filtering
Pandas:
import pandas as pd
# Assume df is already loaded
# df = pd.read_csv("data.csv")
# Single condition
df_filtered = df[df['age'] > 25]
# Multiple conditions
df_filtered = df[(df['age'] > 25) & (df['active'] == True)]
# String operations
df_filtered = df[df['email'].str.contains('@example.com')]
Moltres:
from moltres import connect
from moltres import col
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create table and data first
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
column("age", "INTEGER"),
column("active", "INTEGER"),
column("email", "TEXT"),
]).collect()
Records.from_list([
{"id": 1, "name": "Alice", "age": 30, "active": 1, "email": "alice@example.com"},
{"id": 2, "name": "Bob", "age": 25, "active": 0, "email": "bob@example.com"},
], database=db).insert_into("users")
df = db.table("users").select()
# Single condition
df_filtered = df.where(col("age") > 25)
# Multiple conditions
df_filtered = df.where(
(col("age") > 25) & (col("active") == 1)
)
# String operations
df_filtered = df.where(
col("email").like("%@example.com")
)
# Execute to get results
results = df_filtered.collect()
3. Selecting Columns
Pandas:
import pandas as pd
# Assume df is already loaded
# df = pd.read_csv("data.csv")
# Select specific columns
df_selected = df[['id', 'name', 'email']]
# Select with new column
df['age_plus_10'] = df['age'] + 10
Moltres:
from moltres import connect
from moltres import col
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create table and data first
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
column("email", "TEXT"),
column("age", "INTEGER"),
]).collect()
Records.from_list([
{"id": 1, "name": "Alice", "email": "alice@example.com", "age": 30},
{"id": 2, "name": "Bob", "email": "bob@example.com", "age": 25},
], database=db).insert_into("users")
df = db.table("users").select()
# Select specific columns
df_selected = df.select("id", "name", "email")
# Select with computed column
df_new = df.select(
"*", # All existing columns
(col("age") + 10).alias("age_plus_10")
)
4. GroupBy and Aggregations
Pandas:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create table and data first
db.create_table("sales", [
column("id", "INTEGER", primary_key=True),
column("country", "TEXT"),
column("amount", "REAL"),
]).collect()
Records.from_list([
{"id": 1, "country": "USA", "amount": 100.0},
{"id": 2, "country": "UK", "amount": 200.0},
], database=db).insert_into("sales")
df = db.table("sales").select()
# Simple aggregation
result = df.groupby('country').agg({
'amount': 'sum',
'id': 'count'
})
# Multiple aggregations
result = df.groupby('country').agg({
'amount': ['sum', 'mean', 'max'],
'id': 'count'
})
Moltres:
from moltres import connect
from moltres import col
from moltres.expressions import functions as F
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create table and data first
db.create_table("sales", [
column("id", "INTEGER", primary_key=True),
column("country", "TEXT"),
column("amount", "REAL"),
]).collect()
Records.from_list([
{"id": 1, "country": "USA", "amount": 100.0},
{"id": 2, "country": "UK", "amount": 200.0},
], database=db).insert_into("sales")
df = db.table("sales").select()
# Simple aggregation
result = (
df.group_by("country")
.agg(
F.sum(col("amount")).alias("total_amount"),
F.count(col("id")).alias("count")
)
)
# Multiple aggregations on same column
result = (
df.group_by("country")
.agg(
F.sum(col("amount")).alias("total"),
F.avg(col("amount")).alias("avg"),
F.max(col("amount")).alias("max"),
F.count("*").alias("count")
)
)
5. Joins
Pandas:
import pandas as pd
# Assume df1 and df2 are already loaded
# df1 = pd.read_csv("data1.csv")
# df2 = pd.read_csv("data2.csv")
# Inner join
result = df1.merge(df2, on='key', how='inner')
# Left join
result = df1.merge(df2, on='key', how='left')
# Right join
result = df1.merge(df2, on='key', how='right')
# Outer join
result = df1.merge(df2, on='key', how='outer')
Moltres:
from moltres import connect, col
from moltres.table.schema import column
from moltres.io.records import Records
# Setup: Create tables and data
db = connect("sqlite:///:memory:")
db.create_table("table1", [column("key", "INTEGER"), column("value1", "TEXT")]).collect()
db.create_table("table2", [column("key", "INTEGER"), column("value2", "TEXT")]).collect()
df1 = db.table("table1").select()
df2 = db.table("table2").select()
# Inner join (default)
# Note: Use table-qualified column names to avoid ambiguity
result = df1.join(
df2,
on=[col("table1.key") == col("table2.key")]
)
# Left join
result = df1.join(
df2,
on=[col("table1.key") == col("table2.key")],
how="left"
)
# Right join
result = df1.join(
df2,
on=[col("table1.key") == col("table2.key")],
how="right"
)
# Outer join
result = df1.join(
df2,
on=[col("table1.key") == col("table2.key")],
how="outer"
)
6. Sorting
Pandas:
from moltres import connect
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create table and data first
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
column("age", "INTEGER"),
column("active", "INTEGER"),
]).collect()
Records.from_list([
{"id": 1, "name": "Alice", "age": 30, "active": 1},
{"id": 2, "name": "Bob", "age": 25, "active": 0},
], database=db).insert_into("users")
df = db.table("users").select()
# Note: sort_values() is pandas syntax. In Moltres, use order_by()
# This example shows pandas syntax for comparison
try:
import pandas as pd
# Convert to pandas first
df_pandas = df.collect(format="pandas")
df_sorted = df_pandas.sort_values('age')
df_sorted = df_pandas.sort_values('age', ascending=False)
df_sorted = df_pandas.sort_values(['active', 'age'], ascending=[False, True])
except ImportError:
print("pandas not installed. Install with: pip install pandas or pip install moltres[pandas]")
Moltres:
from moltres import connect, col
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
column("age", "INTEGER"),
column("active", "INTEGER"),
]).collect()
Records.from_list([
{"id": 1, "name": "Alice", "age": 30, "active": 1},
{"id": 2, "name": "Bob", "age": 25, "active": 0},
], database=db).insert_into("users")
df = db.table("users").select()
# Ascending
df_sorted = df.order_by("age")
# Descending
df_sorted = df.order_by(col("age").desc())
# Multiple columns
df_sorted = df.order_by(
col("active").desc(),
col("age").asc()
)
7. Adding/Modifying Columns
Pandas:
import pandas as pd
# Assume df is already loaded
# df = pd.read_csv("data.csv")
# Add new column
df['new_col'] = df['col1'] + df['col2']
# Modify existing
df['age'] = df['age'] + 1
# Conditional column
df['category'] = df['age'].apply(lambda x: 'adult' if x >= 18 else 'minor')
Moltres:
from moltres import connect, col
from moltres.expressions import functions as F
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
db.create_table("data", [
column("id", "INTEGER", primary_key=True),
column("col1", "INTEGER"),
column("col2", "INTEGER"),
column("age", "INTEGER"),
]).collect()
Records.from_list([
{"id": 1, "col1": 10, "col2": 20, "age": 25},
{"id": 2, "col1": 15, "col2": 25, "age": 17},
], database=db).insert_into("data")
df = db.table("data").select()
# Add new column (use withColumn or select)
df_new = df.withColumn(
"new_col",
col("col1") + col("col2")
)
# Or in select
df_new = df.select(
"*",
(col("col1") + col("col2")).alias("new_col")
)
# Conditional column
df_new = df.withColumn(
"category",
F.when(col("age") >= 18, "adult").otherwise("minor")
)
8. Working with Results
Pandas:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Note: This example shows pandas syntax
try:
import pandas as pd
# Results are already DataFrames
# df = pd.read_csv("data.csv") # Requires data.csv file
# result = df.groupby('country').sum()
# result is a DataFrame
print("pandas example - requires data.csv file")
except ImportError:
print("pandas not installed. Install with: pip install pandas or pip install moltres[pandas]")
Moltres:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# Results are lists of dicts by default
df = db.table("users").select()
results = df.collect()
# results is: [{'id': 1, 'name': 'Alice'}, ...]
# Get as pandas DataFrame (requires: pip install pandas)
try:
results = df.collect(format="pandas")
# results is: pd.DataFrame
except ImportError:
print("pandas not installed. Install with: pip install pandas or pip install moltres[pandas]")
# Or convert to pandas after
try:
import pandas as pd
results = df.collect()
df_pandas = pd.DataFrame(results)
except ImportError:
print("pandas not installed. Install with: pip install pandas or pip install moltres[pandas]")
Using Pandas-Style Interface
Moltres also provides a pandas-style interface for easier migration:
See also: Pandas interface examples, Pandas/Polars integration examples, and the Pandas Interface Guide
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
column("age", "INTEGER"),
column("active", "INTEGER"),
column("country", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice", "age": 30, "active": 1, "country": "USA"},
{"id": 2, "name": "Bob", "age": 25, "active": 1, "country": "UK"},
], database=db).insert_into("users")
# Get pandas-style DataFrame (requires: pip install pandas)
try:
df = db.table("users").pandas()
# Use pandas-style methods
df_filtered = df.query('age > 25 and active == 1')
df_grouped = df.groupby('country').agg({'age': 'mean', 'id': 'count'})
# Still need to call collect()
results = df_filtered.collect()
except ImportError:
print("pandas not installed. Install with: pip install pandas or pip install moltres[pandas]")
Handling Large Datasets
Pandas limitation:
import pandas as pd
# This loads entire dataset into memory
# df = pd.read_csv("huge_file.csv") # May fail if file > RAM
# df_filtered = df[df['age'] > 25]
Moltres advantage:
from moltres import connect, col
from moltres.table.schema import column
from moltres.io.records import Records
# Setup database
db = connect("sqlite:///:memory:")
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
column("age", "INTEGER"),
]).collect()
Records.from_list([
{"id": 1, "name": "Alice", "age": 30},
{"id": 2, "name": "Bob", "age": 25},
], database=db).insert_into("users")
# This executes in SQL - no memory loading
df = db.table("users").select().where(col("age") > 25)
results = df.collect() # Only results are in memory
# Or use streaming for very large results (requires async)
import asyncio
from moltres import async_connect
async def stream_large_results():
db = await async_connect("postgresql+asyncpg://user:pass@localhost/db")
df = db.table("users").select().where(col("age") > 25)
async for chunk in await df.collect(stream=True):
process_chunk(chunk)
await db.close()
# asyncio.run(stream_large_results())
Common Migration Challenges
Challenge 1: Eager vs Lazy Execution
Problem: Forgetting to call .collect()
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# Pandas (eager)
df = pd.read_csv("data.csv")
df_filtered = df[df['age'] > 25] # Executes immediately
# Moltres (lazy)
df = db.table("users").select()
df_filtered = df.where(col("age") > 25) # Doesn't execute yet!
results = df_filtered.collect() # Must call collect()
Challenge 2: Boolean Values
Problem: SQLite uses INTEGER (0/1) instead of boolean
from moltres import connect
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Pandas
df[df['active'] == True]
# Moltres (SQLite)
df.where(col("active") == 1) # Use 1 instead of True
# Moltres (PostgreSQL/MySQL)
df.where(col("active") == True) # Can use True
Challenge 3: String Operations
Pandas:
import pandas as pd
# Assume df is already loaded
# df = pd.read_csv("data.csv")
df[df['email'].str.contains('@example.com')]
df['name'].str.upper()
Moltres:
from moltres import connect
from moltres.expressions import functions as F
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
df.where(col("email").like("%@example.com"))
df.select(F.upper(col("name")).alias("name_upper"))
Migration Checklist
[ ] Replace
pd.read_csv()withRecords.from_csv()ordb.table().select()[ ] Replace
df[condition]withdf.where(condition)[ ] Replace
df.groupby()withdf.group_by()[ ] Replace
df.merge()withdf.join()[ ] Add
.collect()calls where results are needed[ ] Update boolean comparisons (use 1/0 for SQLite)
[ ] Replace pandas string methods with Moltres functions
[ ] Test with sample data before migrating production code
Next Steps
Performance: See Performance Optimization Guide
Patterns: Check Common Patterns Guide
Best Practices: Read Best Practices Guide