Skip to content

Database Schema Documentation

This document describes the database schema used in the DSTA project for storing cryptocurrency market data.

Overview

The DSTA system uses PostgreSQL 17 with Django ORM to store historical market data, trade information, and market metrics. The schema is optimized for time-series queries and high-volume data ingestion.

Design Principles

  1. Time-Series Optimized: Indexes and constraints designed for efficient time-range queries
  2. Exchange Agnostic: Supports multiple exchanges (Binance, Huobi, Gate.io)
  3. Data Integrity: Unique constraints prevent duplicate data
  4. Scalability: Prepared for millions of records with appropriate indexing
  5. Decimal Precision: Uses DECIMAL type for accurate financial calculations

Supported Exchanges

SUPPORTED_EXCHANGES = ('gateio', 'binance', 'huobi')

Supported Intervals

VALID_INTERVALS = ('10s', '1m', '2m', '5m', '15m', '30m', '1h', '4h', '1d', '7d')

Entity Relationship Diagram

erDiagram
    CANDLESTICK {
        bigint id PK
        varchar exchange
        varchar symbol
        varchar interval
        datetime timestamp
        decimal open_price
        decimal high_price
        decimal low_price
        decimal close_price
        decimal volume
        decimal quote_volume
        int trades_count
        datetime created_at
        datetime updated_at
    }

    TRADE {
        bigint id PK
        varchar exchange
        varchar symbol
        varchar trade_id
        datetime timestamp
        decimal price
        decimal quantity
        decimal quote_quantity
        boolean is_buyer_maker
        datetime created_at
    }

    ORDER_BOOK {
        bigint id PK
        varchar exchange
        varchar symbol
        datetime timestamp
        json bids
        json asks
        bigint last_update_id
        datetime created_at
    }

    MARKET_METRIC {
        bigint id PK
        varchar metric_type
        datetime timestamp
        decimal value
        json metadata
        datetime created_at
    }

Table Descriptions

1. Candlestick Table

Table Name: candlesticks

Purpose: Store OHLCV (Open, High, Low, Close, Volume) candlestick data for cryptocurrency trading pairs.

Schema:

Column Type Nullable Description
id BIGINT NO Auto-incrementing primary key
exchange VARCHAR(20) NO Exchange name (choices: binance, huobi, gateio)
symbol VARCHAR(20) NO Trading pair symbol (e.g., 'BTCUSDT')
interval VARCHAR(5) NO Timeframe (10s, 1m, 5m, 15m, 30m, 1h, 4h, 1d, 7d)
timestamp TIMESTAMP NO Opening time of the candlestick (UTC)
open_price DECIMAL(20,8) NO Opening price (min: 0.00000001)
high_price DECIMAL(20,8) NO Highest price during interval
low_price DECIMAL(20,8) NO Lowest price during interval
close_price DECIMAL(20,8) NO Closing price
volume DECIMAL(30,8) NO Trading volume in base currency
quote_volume DECIMAL(30,8) YES Trading volume in quote currency
trades_count INTEGER YES Number of trades during interval
created_at TIMESTAMP NO Record creation time (auto)
updated_at TIMESTAMP NO Record last update time (auto)

Indexes:

-- Composite index for time-series queries
CREATE INDEX candle_lookup_idx ON candlesticks (exchange, symbol, interval, timestamp);

-- Symbol-specific time queries
CREATE INDEX symbol_time_idx ON candlesticks (symbol, timestamp);

-- Exchange-wide queries
CREATE INDEX exchange_time_idx ON candlesticks (exchange, timestamp);

Constraints:

-- Prevent duplicate candlesticks
ALTER TABLE candlesticks ADD CONSTRAINT unique_candlestick 
    UNIQUE (exchange, symbol, interval, timestamp);

Usage Example:

from api.models import Candlestick
from decimal import Decimal
from datetime import datetime, timezone

# Insert candlestick
candle = Candlestick.objects.create(
    exchange='binance',
    symbol='BTCUSDT',
    interval='1h',
    timestamp=datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc),
    open_price=Decimal('42000.50'),
    high_price=Decimal('42500.75'),
    low_price=Decimal('41800.25'),
    close_price=Decimal('42300.00'),
    volume=Decimal('123.456'),
    quote_volume=Decimal('5200000.00'),
    trades_count=1523
)

# Query candlesticks
btc_hourly = Candlestick.objects.filter(
    exchange='binance',
    symbol='BTCUSDT',
    interval='1h',
    timestamp__gte=datetime(2024, 1, 1, tzinfo=timezone.utc),
    timestamp__lt=datetime(2024, 1, 2, tzinfo=timezone.utc)
).order_by('timestamp')

2. Trade Table

Table Name: trades

Purpose: Store individual trade execution records from exchanges for granular market analysis.

Schema:

Column Type Nullable Description
id BIGINT NO Auto-incrementing primary key
exchange VARCHAR(20) NO Exchange where trade occurred
symbol VARCHAR(20) NO Trading pair symbol
trade_id VARCHAR(50) NO Unique trade ID from exchange
timestamp TIMESTAMP NO Trade execution time (UTC)
price DECIMAL(20,8) NO Trade price
quantity DECIMAL(30,8) NO Trade quantity in base currency
quote_quantity DECIMAL(30,8) YES Trade quantity in quote currency
is_buyer_maker BOOLEAN YES True if buyer was maker, False if seller
created_at TIMESTAMP NO Record creation time (auto)

Indexes:

-- Primary lookup index
CREATE INDEX trade_lookup_idx ON trades (exchange, symbol, timestamp);

-- Symbol-specific queries
CREATE INDEX trade_symbol_time_idx ON trades (symbol, timestamp);

Constraints:

-- Prevent duplicate trades
ALTER TABLE trades ADD CONSTRAINT unique_trade 
    UNIQUE (exchange, symbol, trade_id);

Usage Example:

from api.models import Trade
from decimal import Decimal
from datetime import datetime, timezone, timedelta

# Insert trade
trade = Trade.objects.create(
    exchange='binance',
    symbol='ETHUSDT',
    trade_id='12345678',
    timestamp=datetime(2024, 1, 1, 12, 30, 0, tzinfo=timezone.utc),
    price=Decimal('3000.50'),
    quantity=Decimal('2.5'),
    quote_quantity=Decimal('7501.25'),
    is_buyer_maker=True
)

# Query recent trades
recent_trades = Trade.objects.filter(
    exchange='binance',
    symbol='ETHUSDT',
    timestamp__gte=datetime.now(timezone.utc) - timedelta(hours=1)
).order_by('-timestamp')

3. OrderBook Table

Table Name: order_books

Purpose: Store order book snapshot data for support/resistance level analysis and market depth studies.

Schema:

Column Type Nullable Description
id BIGINT NO Auto-incrementing primary key
exchange VARCHAR(20) NO Exchange where order book is from
symbol VARCHAR(20) NO Trading pair symbol
timestamp TIMESTAMP NO Snapshot timestamp (UTC)
bids JSON NO Array of [price, quantity] bid levels
asks JSON NO Array of [price, quantity] ask levels
last_update_id BIGINT YES Last update ID from exchange
created_at TIMESTAMP NO Record creation time (auto)

Indexes:

-- Primary lookup index
CREATE INDEX orderbook_lookup_idx ON order_books (exchange, symbol, timestamp);

-- Symbol-specific queries
CREATE INDEX orderbook_symbol_time_idx ON order_books (symbol, timestamp);

-- Time-based queries
CREATE INDEX orderbook_time_idx ON order_books (timestamp);

JSON Structure:

{
  "bids": [
    ["42000.00", "1.5"],
    ["41999.00", "2.3"],
    ["41998.00", "0.8"]
  ],
  "asks": [
    ["42001.00", "1.2"],
    ["42002.00", "2.1"],
    ["42003.00", "1.7"]
  ]
}

Usage Example:

from api.models import OrderBook
from datetime import datetime, timezone

# Insert order book snapshot
orderbook = OrderBook.objects.create(
    exchange='binance',
    symbol='BTCUSDT',
    timestamp=datetime.now(timezone.utc),
    bids=[
        ['42000.00', '1.5'],
        ['41999.00', '2.3'],
        ['41998.00', '0.8']
    ],
    asks=[
        ['42001.00', '1.2'],
        ['42002.00', '2.1'],
        ['42003.00', '1.7']
    ],
    last_update_id=123456789
)

# Get latest order book
latest = OrderBook.objects.filter(
    exchange='binance',
    symbol='BTCUSDT'
).order_by('-timestamp').first()

# Calculate spread
best_bid = Decimal(latest.bids[0][0])
best_ask = Decimal(latest.asks[0][0])
spread = best_ask - best_bid

4. MarketMetric Table

Table Name: market_metrics

Purpose: Store market-wide metrics like BTC dominance, total market cap, and Fear & Greed Index.

Schema:

Column Type Nullable Description
id BIGINT NO Auto-incrementing primary key
metric_type VARCHAR(30) NO Type of metric (see choices below)
timestamp TIMESTAMP NO Metric timestamp (UTC)
value DECIMAL(20,8) NO Metric value
metadata JSON YES Additional metadata
created_at TIMESTAMP NO Record creation time (auto)

Metric Types:

  • btc_dominance: Bitcoin market dominance percentage
  • total_market_cap: Total cryptocurrency market capitalization
  • fear_greed_index: Fear & Greed Index (0-100)
  • other: Custom metrics

Indexes:

-- Primary lookup index
CREATE INDEX metric_lookup_idx ON market_metrics (metric_type, timestamp);

Constraints:

-- Prevent duplicate metrics
ALTER TABLE market_metrics ADD CONSTRAINT unique_market_metric 
    UNIQUE (metric_type, timestamp);

Usage Example:

from api.models import MarketMetric
from decimal import Decimal
from datetime import datetime, timezone

# Insert BTC dominance
btc_dom = MarketMetric.objects.create(
    metric_type='btc_dominance',
    timestamp=datetime.now(timezone.utc),
    value=Decimal('45.67'),
    metadata={'source': 'coinmarketcap'}
)

# Insert Fear & Greed Index
fear_greed = MarketMetric.objects.create(
    metric_type='fear_greed_index',
    timestamp=datetime.now(timezone.utc),
    value=Decimal('72'),
    metadata={'classification': 'greed'}
)

# Query historical data
btc_dom_history = MarketMetric.objects.filter(
    metric_type='btc_dominance',
    timestamp__gte=datetime(2024, 1, 1, tzinfo=timezone.utc)
).order_by('timestamp')

Indexing Strategy

Composite Index Design

Our indexes follow a left-to-right matching pattern for optimal query performance:

  1. candle_lookup_idx (exchange, symbol, interval, timestamp)
  2. Optimizes: WHERE exchange='binance' AND symbol='BTCUSDT' AND interval='1h' AND timestamp >= '...'
  3. Also works for: Queries on just exchange, or exchange+symbol, or exchange+symbol+interval

  4. symbol_time_idx (symbol, timestamp)

  5. Optimizes: Multi-exchange queries for a single symbol
  6. Use case: Comparing same pair across exchanges

  7. exchange_time_idx (exchange, timestamp)

  8. Optimizes: Exchange-wide analytics
  9. Use case: Total volume across all pairs on an exchange

Index Usage Guidelines

# ✅ Good - Uses candle_lookup_idx
Candlestick.objects.filter(
    exchange='binance',
    symbol='BTCUSDT',
    interval='1h',
    timestamp__gte=start_date
)

# ✅ Good - Uses symbol_time_idx
Candlestick.objects.filter(
    symbol='BTCUSDT',
    timestamp__range=(start, end)
)

# ❌ Avoid - No index, full table scan
Candlestick.objects.filter(
    interval='1h',  # Not a leading index column
    timestamp__gte=start_date
)

# Fix: Add exchange and symbol filters
Candlestick.objects.filter(
    exchange='binance',
    symbol='BTCUSDT',
    interval='1h',
    timestamp__gte=start_date
)

Query Performance Tips

  1. Always filter by exchange and symbol when querying specific data
  2. Use timestamp ranges to limit result sets
  3. Add .only() to select specific fields and reduce data transfer
  4. Use .iterator() for large result sets to reduce memory usage
  5. Consider database indexes when adding new query patterns

Example optimized query:

# Optimized query
candles = Candlestick.objects.filter(
    exchange='binance',
    symbol='BTCUSDT',
    interval='1h',
    timestamp__gte=start_date,
    timestamp__lt=end_date
).only(
    'timestamp', 'close_price', 'volume'
).order_by('timestamp')

# For very large result sets
for candle in candles.iterator(chunk_size=1000):
    process(candle)

Migration Guide

Initial Setup

# Create migrations
cd src
python manage.py makemigrations api

# Apply migrations
python manage.py migrate api

# Verify tables
python manage.py dbshell
\dt  # List tables
\d candlesticks  # Describe table

Adding New Indexes

When adding new query patterns, you may need additional indexes:

# In api/models.py, add to Meta.indexes
class Meta:
    indexes = [
        # ... existing indexes
        models.Index(fields=['symbol', 'interval', 'timestamp'], name='symbol_interval_time_idx'),
    ]

Then create and apply migration:

python manage.py makemigrations api --name add_symbol_interval_index
python manage.py migrate api

Data Backfill

When backfilling historical data:

# Bulk insert for performance
from api.models import Candlestick
from datetime import datetime, timezone
from decimal import Decimal

candles_to_create = []
for data in historical_data:
    candles_to_create.append(Candlestick(
        exchange='binance',
        symbol='BTCUSDT',
        interval='1h',
        timestamp=data['timestamp'],
        open_price=Decimal(str(data['open'])),
        high_price=Decimal(str(data['high'])),
        low_price=Decimal(str(data['low'])),
        close_price=Decimal(str(data['close'])),
        volume=Decimal(str(data['volume']))
    ))

# Bulk create in batches
batch_size = 1000
for i in range(0, len(candles_to_create), batch_size):
    batch = candles_to_create[i:i + batch_size]
    Candlestick.objects.bulk_create(batch, ignore_conflicts=True)

Database Maintenance

Regular Tasks

Daily: - Monitor disk space usage - Check slow query log - Verify data collection is running

Weekly: - Run VACUUM ANALYZE on large tables - Review and archive old data if needed - Check index usage statistics

Monthly: - Review query performance - Consider table partitioning if tables exceed 10M rows - Update statistics for query planner

Vacuum and Analyze

-- Vacuum and analyze (reclaim space and update statistics)
VACUUM ANALYZE candlesticks;
VACUUM ANALYZE trades;
VACUUM ANALYZE order_books;
VACUUM ANALYZE market_metrics;

Monitoring Queries

-- Table sizes
SELECT 
    schemaname,
    tablename,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Index usage
SELECT 
    schemaname,
    tablename,
    indexname,
    idx_scan,
    idx_tup_read,
    idx_tup_fetch
FROM pg_stat_user_indexes
WHERE schemaname = 'public'
ORDER BY idx_scan DESC;

-- Slow queries (requires pg_stat_statements)
SELECT 
    query,
    calls,
    total_time,
    mean_time,
    rows
FROM pg_stat_statements
ORDER BY mean_time DESC
LIMIT 10;

Advanced Topics

Table Partitioning

For tables with 10M+ rows, consider partitioning by timestamp:

-- Create partitioned table (PostgreSQL 10+)
CREATE TABLE candlesticks_partitioned (
    LIKE candlesticks INCLUDING ALL
) PARTITION BY RANGE (timestamp);

-- Create monthly partitions
CREATE TABLE candlesticks_2024_01 PARTITION OF candlesticks_partitioned
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');

CREATE TABLE candlesticks_2024_02 PARTITION OF candlesticks_partitioned
    FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');

TimescaleDB Extension

For better time-series performance, consider TimescaleDB:

-- Enable TimescaleDB
CREATE EXTENSION IF NOT EXISTS timescaledb;

-- Convert to hypertable
SELECT create_hypertable('candlesticks', 'timestamp');

-- Create retention policy (keep 1 year)
SELECT add_retention_policy('candlesticks', INTERVAL '1 year');

-- Continuous aggregates for common queries
CREATE MATERIALIZED VIEW candlesticks_daily
WITH (timescaledb.continuous) AS
SELECT 
    time_bucket('1 day', timestamp) AS day,
    exchange,
    symbol,
    first(open_price, timestamp) AS open,
    max(high_price) AS high,
    min(low_price) AS low,
    last(close_price, timestamp) AS close,
    sum(volume) AS volume
FROM candlesticks
GROUP BY day, exchange, symbol;

Troubleshooting

Slow Queries

-- Enable query logging in postgresql.conf
log_min_duration_statement = 1000  # Log queries > 1 second

-- Check if indexes are being used
EXPLAIN ANALYZE
SELECT * FROM candlesticks
WHERE exchange='binance' AND symbol='BTCUSDT'
  AND timestamp >= '2024-01-01';

Duplicate Data

# Find duplicates
from django.db.models import Count

duplicates = Candlestick.objects.values(
    'exchange', 'symbol', 'interval', 'timestamp'
).annotate(
    count=Count('id')
).filter(count__gt=1)

# Remove duplicates (keep oldest)
for dup in duplicates:
    candles = Candlestick.objects.filter(
        exchange=dup['exchange'],
        symbol=dup['symbol'],
        interval=dup['interval'],
        timestamp=dup['timestamp']
    ).order_by('created_at')

    # Delete all except first
    candles[1:].delete()

Database Bloat

-- Check table bloat
SELECT 
    schemaname, 
    tablename,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
    pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename) - pg_relation_size(schemaname||'.'||tablename)) AS index_size
FROM pg_tables
WHERE schemaname = 'public'
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Resolve with VACUUM FULL (requires exclusive lock, use during off-hours)
VACUUM FULL ANALYZE candlesticks;

Resources