🇺🇸 Read in English 🇪🇸 Español
Creación de Datasets para Backtesting
El Framework Mental
Un buen dataset de backtesting no es solo data histórica. Es una representación realista del ambiente de trading, incluyendo todas las limitaciones y fricciones que enfrentarás en vivo.
Estructura Base
class BacktestDataset:
    """Container for backtest inputs: per-ticker price data, metadata and universe."""

    def __init__(self):
        self.price_data = {}        # ticker -> OHLCV DataFrame
        self.fundamental_data = {}  # ticker -> fundamentals (float, sector, ...)
        self.universe = []          # tickers tradeable each day
        self.metadata = {}          # ticker -> extra info

    def _validate_data(self, data):
        """Return True when *data* is a usable, non-empty frame.

        Bug fix: this method was called by add_ticker but never defined,
        so the first add_ticker raised AttributeError. Validation is kept
        deliberately lenient (no column checks) because some strategies
        store non-OHLCV frames here, e.g. pair spreads.
        """
        return data is not None and len(data) > 0

    def add_ticker(self, ticker, data, metadata=None):
        """Register *data* under *ticker*.

        Raises ValueError when the data fails validation. Metadata is only
        stored when provided (and truthy), matching the original behavior.
        """
        if not self._validate_data(data):
            raise ValueError(f"Invalid data for {ticker}")
        self.price_data[ticker] = data
        if metadata:
            self.metadata[ticker] = metadata

    def get_universe(self, date):
        """Return tickers tradeable on *date*.

        A ticker qualifies when it has a bar for *date* and traded more
        than $1M in dollar volume that day (minimum-liquidity filter).
        """
        available = []
        for ticker, data in self.price_data.items():
            if date in data.index:
                volume = data.loc[date, 'volume']
                price = data.loc[date, 'close']
                dollar_volume = volume * price
                if dollar_volume > 1_000_000:  # $1M minimum
                    available.append(ticker)
        return available
Datasets por Tipo de Estrategia
1. Dataset para Gap Trading
def create_gap_trading_dataset(start_date, end_date):
    """Build a gap-trading dataset of small-cap tickers with extended-hours data."""
    dataset = BacktestDataset()

    # Universe: small caps with enough volume to trade.
    criteria = {
        'market_cap': (10_000_000, 500_000_000),
        'avg_volume': 500_000,
        'price': (1, 50),
    }

    for ticker in screen_universe(criteria):
        # Pre-market data is required to measure overnight gaps.
        data = fetch_extended_hours_data(ticker, start_date, end_date)

        # Overnight gap: today's open vs. yesterday's close, in percent.
        data['gap_pct'] = (data['open'] / data['close'].shift(1) - 1) * 100
        data['gap_type'] = data['gap_pct'].apply(classify_gap)

        # Supporting indicators: pre-market volume and 20-bar relative volume.
        data['premarket_volume'] = calculate_premarket_volume(data)
        data['rvol'] = data['volume'] / data['volume'].rolling(20).mean()

        # Metadata that matters for gap setups.
        dataset.add_ticker(
            ticker,
            data,
            {
                'float': get_float(ticker),
                'sector': get_sector(ticker),
                'avg_spread': calculate_avg_spread(data),
            },
        )

    return dataset
def classify_gap(gap_pct):
    """Label a percentage gap: large/medium gap up, gap down, or small gap.

    Thresholds: > 20% large up, > 10% medium up, < -10% down, else small.
    """
    if gap_pct < -10:
        return 'gap_down'
    if gap_pct > 20:
        return 'large_gap_up'
    if gap_pct > 10:
        return 'medium_gap_up'
    return 'small_gap'
2. Dataset para Mean Reversion
def create_mean_reversion_dataset(start_date, end_date):
    """Build a mean-reversion dataset of liquid, moderately volatile large caps."""
    dataset = BacktestDataset()

    # Mean reversion wants liquid, stable names.
    criteria = {
        'market_cap': (1_000_000_000, None),  # large caps
        'avg_volume': 5_000_000,
        'volatility': (0.01, 0.05),  # moderate volatility
    }

    for ticker in screen_universe(criteria):
        data = fetch_data(ticker, start_date, end_date)

        # Distance from the 20-day simple moving average.
        sma = data['close'].rolling(20).mean()
        data['sma_20'] = sma
        data['distance_from_mean'] = (data['close'] - sma) / sma

        # Bollinger Bands and position of the close within the band.
        upper, middle, lower = calculate_bollinger_bands(data)
        data['bb_upper'], data['bb_middle'], data['bb_lower'] = upper, middle, lower
        data['bb_position'] = (data['close'] - lower) / (upper - lower)

        # RSI and 20-bar z-score of the close.
        data['rsi'] = calculate_rsi(data['close'])
        rolling_close = data['close'].rolling(20)
        data['zscore'] = (data['close'] - rolling_close.mean()) / rolling_close.std()

        dataset.add_ticker(ticker, data)

    return dataset
3. Dataset para Pairs Trading
def create_pairs_trading_dataset(start_date, end_date):
    """Build spread series for highly correlated pairs within each sector ETF."""
    dataset = BacktestDataset()

    # Sector ETFs: tech, financials, energy, healthcare.
    pairs = []
    for sector in ('XLK', 'XLF', 'XLE', 'XLV'):
        stocks = get_sector_stocks(sector)
        corr = calculate_correlation_matrix(stocks, start_date, end_date)
        # Keep only strongly correlated candidates.
        pairs += find_high_correlation_pairs(corr, threshold=0.8)

    # Build one spread frame per pair.
    for stock1, stock2 in pairs:
        data1 = fetch_data(stock1, start_date, end_date)
        data2 = fetch_data(stock2, start_date, end_date)

        # Align both legs on the first leg's index.
        spread = pd.DataFrame(index=data1.index)
        spread['price1'] = data1['close']
        spread['price2'] = data2['close']

        # Price ratio and its 20-bar z-score.
        spread['ratio'] = spread['price1'] / spread['price2']
        spread['ratio_mean'] = spread['ratio'].rolling(20).mean()
        spread['ratio_std'] = spread['ratio'].rolling(20).std()
        spread['zscore'] = (spread['ratio'] - spread['ratio_mean']) / spread['ratio_std']

        # Cointegration flag for the pair.
        spread['is_cointegrated'] = test_cointegration(data1['close'], data2['close'])

        dataset.add_ticker(f"{stock1}_{stock2}_pair", spread)

    return dataset
Agregar Datos Fundamentales
def enrich_with_fundamentals(dataset):
    """Attach fundamental data (float, short interest, earnings, news) per ticker.

    Bug fix: tickers added without metadata have no entry in
    dataset.metadata, so the original `dataset.metadata[ticker].update(...)`
    raised KeyError; `setdefault` creates the dict on demand.

    Returns the same dataset (mutated in place) for chaining.
    """
    for ticker in dataset.price_data:
        dataset.metadata.setdefault(ticker, {}).update({
            'float_history': get_historical_float(ticker),
            'short_interest': get_short_interest(ticker),
            'earnings_dates': get_earnings_calendar(ticker),
            'news_sentiment': get_news_sentiment(ticker),
        })
    return dataset
Considerar Costos y Fricciones
class RealisticBacktestData:
    """Wrap a dataset with a simple cost/impact model for realistic fills."""

    def __init__(self, base_dataset):
        self.data = base_dataset
        self.cost_model = CostModel()

    def add_market_impact(self, ticker, date, size):
        """Estimate price impact (as a fraction) for an order of *size* shares.

        Impact grows with the order's participation rate in that day's volume.
        """
        bar_volume = self.data.price_data[ticker].loc[date, 'volume']
        rate = size / bar_volume
        # Tiered impact model: 1bp below 1% participation, 5bp below 5%,
        # then linear growth beyond that.
        if rate < 0.01:
            return 0.0001
        if rate < 0.05:
            return 0.0005
        return 0.001 + (rate - 0.05) * 0.01

    def calculate_realistic_fill(self, ticker, date, side, size):
        """Return a fill price including spread crossing, impact and slippage."""
        bar = self.data.price_data[ticker].loc[date]
        impact = self.add_market_impact(ticker, date, size)
        if side == 'buy':
            # Buys pay a slight premium (near the ask) plus impact.
            fill_price = bar['close'] * 1.0001 * (1 + impact)
        else:
            # Sells concede a slight discount (near the bid) minus impact.
            fill_price = bar['close'] * 0.9999 * (1 - impact)
        # Random slippage with 1bp standard deviation.
        return fill_price * (1 + np.random.normal(0, 0.0001))
Splits y Corporate Actions
def handle_corporate_actions(dataset):
    """Adjust price history in place for splits; dividends are noted, not applied."""
    for ticker in dataset.price_data:
        for action in get_corporate_actions(ticker):
            if action['type'] == 'split':
                # Back-adjust: divide pre-split prices by the ratio and
                # scale volume up so dollar volume stays comparable.
                ratio = action['ratio']
                before_split = dataset.price_data[ticker].index < action['date']
                dataset.price_data[ticker].loc[before_split, ['open', 'high', 'low', 'close']] /= ratio
                dataset.price_data[ticker].loc[before_split, 'volume'] *= ratio
            elif action['type'] == 'dividend':
                # Reads kept for parity (a malformed action raises KeyError,
                # as before). Whether to adjust for dividends depends on the
                # strategy, so no adjustment is applied here.
                _ = (action['amount'], action['date'])
Survivorship Bias
def create_survivorship_bias_free_dataset(start_date, end_date):
    """Build a dataset that also includes bankrupt/delisted names.

    Uses the index membership as of *start_date* so the universe is not
    limited to today's survivors.

    Bug fix: the original built the dataset but never returned it, so
    callers always received None.
    """
    dataset = BacktestDataset()
    # Historical constituents, not the current list.
    historical_universe = get_historical_constituents('Russell3000', start_date)
    for ticker in historical_universe:
        try:
            data = fetch_data(ticker, start_date, end_date)
            # Flag delisted names so the backtester can model the delisting.
            if ticker in get_delisted_tickers():
                data['is_delisted'] = True
                data['delisting_date'] = get_delisting_date(ticker)
            dataset.add_ticker(ticker, data)
        except DataNotAvailable:
            # Important: keep the ticker pool honest even with partial data.
            print(f"Warning: Limited data for {ticker}")
    return dataset
Optimización para Velocidad
class OptimizedBacktestData:
    """Read-optimized view of a dataset: one MultiIndex frame + per-date universes."""

    def __init__(self, dataset):
        self.dataset = dataset
        self._create_indexes()

    def _create_indexes(self):
        """Pre-compute a (ticker, date) MultiIndex frame and daily universes.

        Bug fix: the original wrote a 'ticker' column directly into the
        caller's DataFrames, silently mutating dataset.price_data; each
        frame is now copied before being tagged.
        """
        frames = []
        for ticker, df in self.dataset.price_data.items():
            tagged = df.copy()  # do not mutate the caller's data
            tagged['ticker'] = ticker
            frames.append(tagged)
        self.combined_df = pd.concat(frames)
        self.combined_df.set_index(['ticker', self.combined_df.index], inplace=True)

        # Pre-compute the tradeable universe for every date seen.
        self.daily_universes = {}
        for date in self.combined_df.index.get_level_values(1).unique():
            self.daily_universes[date] = self.dataset.get_universe(date)

    def get_bar(self, ticker, date):
        """Fast single-bar lookup via the (ticker, date) MultiIndex."""
        return self.combined_df.loc[(ticker, date)]

    def get_multiple_bars(self, tickers, date):
        """Fetch bars for several tickers on one date in a single lookup."""
        return self.combined_df.loc[(tickers, date)]
Validación Final del Dataset
def validate_backtest_dataset(dataset):
    """Run sanity checks before backtesting; return a list of issue strings."""
    issues = []

    # 1. Look-ahead bias scan.
    # NOTE(review): str(Series) renders the values, not the expression that
    # built the column, so this heuristic can essentially never fire — kept
    # as-is for behavioral parity; real detection needs provenance tracking.
    for ticker, data in dataset.price_data.items():
        for col in data.columns:
            if 'shift(-' in str(data[col]):
                issues.append(f"Possible look-ahead bias in {ticker}:{col}")

    # 2. Temporal consistency: compare each ticker against the union of dates.
    dates = set()
    for data in dataset.price_data.values():
        dates.update(data.index)

    for ticker, data in dataset.price_data.items():
        coverage = len(data) / len(dates)
        if coverage < 0.8:  # less than 80% coverage of the date union
            issues.append(f"{ticker} has only {coverage:.1%} date coverage")

    # 3. Extreme data: a single-day return above +500% is suspect.
    for ticker, data in dataset.price_data.items():
        if (data['close'].pct_change() > 5).any():
            issues.append(f"{ticker} has suspicious returns > 500%")

    return issues
Mi Setup Personal
# create_my_dataset.py
def create_my_trading_dataset():
    """Build, enrich and optimize one dataset per strategy.

    Bug fix: the original expanded the whole config dict (**config) into
    builders that accept only (start_date, end_date), which raises
    TypeError on unexpected keyword arguments; only the date range is
    forwarded now. The remaining config keys describe screening that each
    builder performs internally.
    """
    config = {
        'start_date': '2022-01-01',
        'end_date': '2024-01-01',
        'universe': 'small_caps',
        'min_volume': 1_000_000,
        'min_price': 1,
        'max_price': 50,
    }

    # One dataset per strategy, built over the same date range.
    dates = (config['start_date'], config['end_date'])
    datasets = {
        'gap_trading': create_gap_trading_dataset(*dates),
        'vwap_bounce': create_mean_reversion_dataset(*dates),
        'momentum': create_momentum_dataset(*dates),
    }

    # Enrich with fundamentals and adjust for corporate actions in place.
    for dataset in datasets.values():
        enrich_with_fundamentals(dataset)
        handle_corporate_actions(dataset)

    # Wrap each dataset for fast (ticker, date) lookups.
    return {
        name: OptimizedBacktestData(dataset)
        for name, dataset in datasets.items()
    }
Siguiente Paso
Con datasets robustos listos, pasemos a los Indicadores Técnicos específicos para small caps.