Performance Optimization Guide
Learn how to write efficient Moltres queries and optimize performance.
Understanding Moltres Performance
Moltres compiles DataFrame operations into SQL and executes them on your database. Performance depends on:
Database optimizer: Your database’s query planner
Query structure: How operations are composed
Indexes: Database indexes on columns used in filters/joins
Connection pooling: Efficient connection management
Result size: Amount of data returned
Best Practices
1. Use Indexes
Create indexes on columns used in WHERE clauses, JOINs, and ORDER BY:
from moltres import connect
db = connect("postgresql://user:pass@localhost/mydb")
# Create index on frequently filtered column
db.create_index("idx_user_email", "users", "email").collect()
# Create composite index for multi-column queries
db.create_index(
"idx_user_country_active",
"users",
["country", "active"]
).collect()
When to index:
Columns in WHERE clauses
JOIN keys
Columns in ORDER BY
Columns in GROUP BY (sometimes)
2. Filter Early
Apply filters as early as possible in your query chain:
from moltres import connect
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# ❌ Bad: Filters large dataset after join
df = (
db.table("orders")
.select()
.join(db.table("users").select(), on=[...])
.where(col("orders.amount") > 1000) # Filter after join
)
# ✅ Good: Filter before join
df = (
db.table("orders")
.select()
.where(col("amount") > 1000) # Filter early
.join(db.table("users").select(), on=[...])
)
3. Select Only Needed Columns
Avoid selecting all columns when you only need a few:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ❌ Bad: Selects all columns
df = db.table("users").select() # Selects all columns
results = df.collect()
# ✅ Good: Select only needed columns
df = db.table("users").select("id", "name", "email")
results = df.collect()
4. Use LIMIT for Exploration
When exploring data, use LIMIT to avoid loading large result sets:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ✅ Good: Limit results during exploration
df = (
db.table("users")
.select()
.where(col("age") > 25)
.limit(100) # Only get first 100 rows
)
results = df.collect()
5. Optimize Joins
Use appropriate join types:
# ✅ Use INNER JOIN when possible (faster than OUTER)
df = df1.join(df2, on=[...], how="inner")
# ✅ Filter before joining (reduces join size)
df1_filtered = df1.where(col("active") == 1)
df = df1_filtered.join(df2, on=[...])
Ensure join keys are indexed:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create indexes on join keys
db.create_index("idx_orders_user_id", "orders", "user_id").collect()
db.create_index("idx_users_id", "users", "id").collect()
6. Use Aggregations Efficiently
Push aggregations to the database level:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ✅ Good: Aggregation happens in SQL
df = (
db.table("orders")
.select()
.group_by("user_id")
.agg(F.sum(col("amount")).alias("total"))
.limit(100)
)
results = df.collect() # Only 100 aggregated rows returned
# ❌ Bad: Would load all data, then aggregate in Python
# (Moltres doesn't do this, but be aware of the pattern)
7. Connection Pooling
Configure connection pooling for better performance:
db = connect(
"postgresql://user:pass@localhost/mydb",
pool_size=10, # Number of connections to maintain
max_overflow=20, # Additional connections allowed
pool_timeout=30, # Timeout for getting connection
pool_recycle=3600, # Recycle connections after 1 hour
pool_pre_ping=True # Verify connections before use
)
8. Use Streaming for Large Results
For large result sets, use streaming to process in chunks:
# ✅ Good: Stream large results
async def process_large_dataset():
db = await async_connect("postgresql+asyncpg://...")
df = db.table("large_table").select()
async for chunk in await df.collect(stream=True):
# Process chunk (e.g., 1000 rows at a time)
process_chunk(chunk)
await db.close()
9. Batch Operations
For INSERT/UPDATE/DELETE, operations are automatically batched, but you can optimize:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# ✅ Good: Single batch insert
records = Records.from_list(large_list, database=db)
result = records.insert_into("users") # Automatically batched
# ✅ Good: Batch updates
result = db.update(
"users",
where=col("status") == "pending",
set={"status": "active"}
) # Single SQL statement, updates all matching rows
10. Avoid N+1 Queries
Don’t call .collect() in loops:
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ❌ Bad: N+1 queries
user_ids = [1, 2, 3, 4, 5]
results = []
for user_id in user_ids:
df = db.table("orders").select().where(col("user_id") == user_id)
results.append(df.collect()) # One query per iteration!
# ✅ Good: Single query with IN clause
df = db.table("orders").select().where(col("user_id").isin(user_ids))
results = df.collect() # Single query
Database-Specific Optimizations
PostgreSQL
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Use EXPLAIN ANALYZE to understand query plans
plan = df.explain(analyze=True)
print(plan)
# Use PostgreSQL-specific features
# - JSONB for JSON data
# - Array types for arrays
# - Full-text search indexes
MySQL
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Use EXPLAIN to understand query plans
plan = df.explain()
print(plan)
# MySQL-specific optimizations
# - Use InnoDB for transactions
# - Configure buffer pool size
# - Use covering indexes
SQLite
# SQLite is single-threaded, but very fast for small-medium datasets
# Use WAL mode for better concurrency
# db = connect("sqlite:///:memory:") # For testing
# db = connect("sqlite:///example.db?mode=rwc") # For production with WAL mode
# Create indexes (SQLite doesn't auto-index foreign keys)
db.create_index("idx_orders_user_id", "orders", "user_id").collect()
Monitoring Performance
1. Enable SQL Logging
db = connect(
"postgresql://user:pass@localhost/mydb",
echo=True # Log all SQL statements
)
2. Use Performance Hooks
from moltres.engine import register_performance_hook
def log_query(sql: str, elapsed: float, metadata: dict):
if elapsed > 1.0: # Log slow queries
print(f"Slow query ({elapsed:.2f}s): {sql[:200]}")
register_performance_hook("query_end", log_query)
3. Check Query Plans
from moltres import connect
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Get estimated plan
plan = df.explain()
print(plan)
# Get actual execution stats (PostgreSQL)
plan = df.explain(analyze=True)
print(plan)
Common Performance Pitfalls
1. Loading Entire Tables
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ❌ Bad: Loads entire table
df = db.table("huge_table").select()
results = df.collect() # May be millions of rows!
# ✅ Good: Filter first
df = db.table("huge_table").select().where(col("date") >= "2024-01-01")
results = df.collect()
2. Multiple Collect Calls
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ❌ Bad: Multiple queries
df = db.table("users").select()
count = len(df.collect()) # Query 1
filtered = df.where(col("age") > 25).collect() # Query 2
# ✅ Good: Single query with aggregation
df = db.table("users").select()
count_df = df.select(F.count("*").alias("count"))
count = count_df.collect()[0]["count"]
filtered = df.where(col("age") > 25).collect()
3. Unnecessary Joins
from moltres import connect
from moltres.table.schema import column
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# ❌ Bad: Join when not needed
df = (
db.table("orders")
.select()
.join(db.table("users").select(), on=[...])
.select("orders.id", "orders.amount") # Only using orders columns
)
# ✅ Good: No join needed
df = db.table("orders").select("id", "amount")
4. Over-aggregation
from moltres import connect
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
# ❌ Bad: Aggregating then filtering
df = (
db.table("orders")
.select()
.group_by("user_id")
.agg(F.sum(col("amount")).alias("total"))
)
results = [r for r in df.collect() if r["total"] > 1000] # Filter in Python
# ✅ Good: Filter in SQL using HAVING
df = (
db.table("orders")
.select()
.group_by("user_id")
.agg(F.sum(col("amount")).alias("total"))
.having(col("total") > 1000)
)
results = df.collect()
Performance Testing
Benchmark Your Queries
from moltres import connect
import time
from moltres.table.schema import column
from moltres.io.records import Records
# Use in-memory SQLite for easy setup (no file needed)
db = connect("sqlite:///:memory:")
# Create sample table
db.create_table("users", [
column("id", "INTEGER", primary_key=True),
column("name", "TEXT"),
]).collect()
# Insert sample data
Records.from_list([
{"id": 1, "name": "Alice"},
{"id": 2, "name": "Bob"},
], database=db).insert_into("users")
def benchmark_query(df):
start = time.perf_counter()
results = df.collect()
elapsed = time.perf_counter() - start
print(f"Query took {elapsed:.2f}s, returned {len(results)} rows")
return results
# Test different approaches
df1 = db.table("users").select().where(col("age") > 25)
df2 = db.table("users").select("id", "name").where(col("age") > 25)
results1 = benchmark_query(df1)
results2 = benchmark_query(df2)
Next Steps
Patterns: See Common Patterns Guide for optimized patterns
Best Practices: Read Best Practices Guide
Troubleshooting: Check Troubleshooting Guide for performance issues