🇪🇸 Leer en Español 🇺🇸 English
Machine Learning Applied to Quantitative Trading
Introduction
Machine Learning offers powerful tools for detecting non-linear patterns in financial markets, identifying market regimes, and creating predictive models. This documentation covers practical implementations specifically validated for quantitative trading.
Unsupervised Models
Hidden Markov Models (HMM) for Regime Detection
HMMs are especially useful for identifying hidden market states (bullish/bearish, high/low volatility) that are not directly observable but influence price behavior.
Core Concepts
What Are Markov States?
- They represent discrete conditions a system can occupy
- Only the current state matters for predicting the next step
- Hidden states influence the observations (prices) we see
Applications in Trading:
- Market regime detection (bullish/bearish)
- Identifying high/low volatility periods
- Changes in market structure
- Entry/exit signals based on state transitions
Basic Implementation with HMM
import numpy as np
import pandas as pd
import yfinance as yf
from hmmlearn import hmm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
class MarketRegimeDetector:
    """
    Market regime detector using a Gaussian Hidden Markov Model.

    Workflow: ``fit(df)`` on historical OHLCV data, then
    ``predict_current_regime(df)``, ``analyze_regimes()`` and
    ``plot_regimes()`` to inspect the detected hidden states.
    """

    def __init__(self, n_components=2, covariance_type="full", random_state=42):
        """
        Parameters
        ----------
        n_components : int
            Number of hidden states (typically 2-4 for markets)
        covariance_type : str
            Covariance matrix type ('full', 'diag', 'tied', 'spherical')
        random_state : int
            Seed for reproducibility
        """
        self.n_components = n_components
        self.model = hmm.GaussianHMM(
            n_components=n_components,
            covariance_type=covariance_type,
            random_state=random_state
        )
        self.scaler = StandardScaler()
        self.is_fitted = False

    def prepare_features(self, df):
        """
        Prepare features for the HMM model.

        Parameters
        ----------
        df : pd.DataFrame
            DataFrame with OHLCV columns

        Returns
        -------
        tuple of (np.ndarray, pd.Index)
            Normalized feature matrix and the index of the rows kept
            after dropping NaNs produced by the rolling windows.
        """
        features = pd.DataFrame(index=df.index)
        # Log returns
        features['log_returns'] = np.log(df['Close'] / df['Close'].shift(1))
        # Normalized daily range
        features['daily_range'] = (df['High'] / df['Low']) - 1
        # Realized volatility (5-day window)
        features['realized_vol'] = features['log_returns'].rolling(5).std()
        # Relative volume
        features['volume_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()
        # RSI as momentum proxy
        features['rsi'] = self.calculate_rsi(df['Close'], period=14)
        # Remove NaN rows, then normalize
        features_clean = features.dropna()
        # BUGFIX: fit the scaler only during training.  The previous version
        # called fit_transform unconditionally, so every prediction call
        # silently re-standardized on the new sample (train/serve skew).
        if self.is_fitted:
            features_scaled = self.scaler.transform(features_clean)
        else:
            features_scaled = self.scaler.fit_transform(features_clean)
        return features_scaled, features_clean.index

    def calculate_rsi(self, series, period=14):
        """Calculate the Relative Strength Index over `period` bars."""
        delta = series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    def fit(self, df):
        """
        Train the HMM model.

        Parameters
        ----------
        df : pd.DataFrame
            Historical data with OHLCV columns
        """
        features, self.feature_index = self.prepare_features(df)
        # Train the model
        self.model.fit(features)
        self.is_fitted = True
        # Predict states for the training window
        self.hidden_states = self.model.predict(features)
        # Save data for later analysis/plotting
        self.features = features
        self.original_data = df.loc[self.feature_index]
        return self

    def predict_current_regime(self, df):
        """
        Predict the current market regime.

        Returns
        -------
        dict
            Current regime information: state, per-state probabilities,
            confidence (max probability) and the recent Viterbi sequence.
        """
        if not self.is_fitted:
            raise ValueError("Model not trained. Run fit() first.")
        features, _ = self.prepare_features(df)
        # Predict the state of the most recent observation
        # (features[-1:] is already a 2D (1, n_features) slice).
        current_state = self.model.predict(features[-1:])[0]
        # Most likely state path over the last 10 observations
        log_prob, state_sequence = self.model.decode(features[-10:], algorithm="viterbi")
        # BUGFIX: predict_proba already returns probabilities (not log
        # probabilities); the previous np.exp wrapper inflated every value.
        state_probs = self.model.predict_proba(features[-1:])[0]
        return {
            'current_state': current_state,
            'state_probabilities': state_probs,
            'confidence': np.max(state_probs),
            'recent_sequence': state_sequence
        }

    def analyze_regimes(self):
        """
        Summarize return/volatility/volume statistics for each regime.
        """
        if not self.is_fitted:
            raise ValueError("Model not trained.")
        regime_analysis = {}
        for state in range(self.n_components):
            mask = self.hidden_states == state
            state_data = self.original_data[mask]
            if len(state_data) > 0:
                avg_return = state_data['Close'].pct_change().mean()
                volatility = state_data['Close'].pct_change().std()
                avg_volume = state_data['Volume'].mean()
                duration = len(state_data)
                regime_analysis[f'State_{state}'] = {
                    'average_return': avg_return,
                    'volatility': volatility,
                    'average_volume': avg_volume,
                    'duration_days': duration,
                    'percentage_time': duration / len(self.hidden_states),
                    'regime_type': 'Bullish' if avg_return > 0 else 'Bearish'
                }
        return regime_analysis

    def plot_regimes(self, title="Market Regimes Detection"):
        """
        Visualize the detected regimes: price colored by regime on top,
        hidden-state sequence below.
        """
        if not self.is_fitted:
            raise ValueError("Model not trained.")
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))
        # Plot 1: price with regimes
        colors = ['green', 'red', 'blue', 'orange'][:self.n_components]
        for state in range(self.n_components):
            mask = self.hidden_states == state
            state_data = self.original_data[mask]
            ax1.scatter(state_data.index, state_data['Close'],
                        c=colors[state], label=f'Regime {state}', alpha=0.6, s=10)
        ax1.plot(self.original_data.index, self.original_data['Close'],
                 'k-', alpha=0.3, linewidth=0.5)
        ax1.set_title(f'{title} - Price Action')
        ax1.set_ylabel('Price')
        ax1.legend()
        ax1.grid(True, alpha=0.3)
        # Plot 2: state sequence over time
        ax2.plot(self.feature_index, self.hidden_states, 'k-', linewidth=2)
        ax2.fill_between(self.feature_index, 0, self.hidden_states, alpha=0.3)
        ax2.set_title('Hidden States Sequence')
        ax2.set_ylabel('State')
        ax2.set_xlabel('Date')
        ax2.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()
        return fig
def hmm_trading_strategy(df, detector, confidence_threshold=0.7):
    """
    HMM-based trading strategy with daily walk-forward retraining.

    Parameters
    ----------
    df : pd.DataFrame
        Historical data
    detector : MarketRegimeDetector
        Trained detector
        NOTE(review): this argument is currently unused — a fresh 2-state
        detector is retrained inside the loop each day; confirm intent.
    confidence_threshold : float
        Confidence threshold for generating signals

    Returns
    -------
    pd.DataFrame
        Per-day price, signal (+1/-1/0), regime id and confidence.
    """
    signals = pd.DataFrame(index=df.index)
    signals['price'] = df['Close']
    signals['signal'] = 0
    signals['regime'] = np.nan
    signals['confidence'] = np.nan
    # Rolling window for predictions
    window_size = 252  # 1 year of data
    for i in range(window_size, len(df)):
        # Training data: trailing one-year window only (no look-ahead)
        train_data = df.iloc[i-window_size:i]
        # Train detector (retrained from scratch every bar — slow but leak-free)
        temp_detector = MarketRegimeDetector(n_components=2)
        temp_detector.fit(train_data)
        # Predict current regime
        current_data = df.iloc[i-50:i+1]  # Last 50 days for context
        regime_info = temp_detector.predict_current_regime(current_data)
        current_idx = df.index[i]
        signals.loc[current_idx, 'regime'] = regime_info['current_state']
        signals.loc[current_idx, 'confidence'] = regime_info['confidence']
        # Generate signals only with high confidence
        if regime_info['confidence'] > confidence_threshold:
            # Analyze regime characteristics
            regime_analysis = temp_detector.analyze_regimes()
            current_regime = f"State_{regime_info['current_state']}"
            if current_regime in regime_analysis:
                regime_return = regime_analysis[current_regime]['average_return']
                # Signal based on regime type (±0.1% average daily return cutoffs)
                if regime_return > 0.001:  # Bullish regime
                    signals.loc[current_idx, 'signal'] = 1
                elif regime_return < -0.001:  # Bearish regime
                    signals.loc[current_idx, 'signal'] = -1
    return signals
# Complete usage example
def hmm_example_analysis():
    """
    Complete HMM analysis example for trading: fits a 2-state detector on
    SPY, prints regime statistics, runs the walk-forward strategy and plots
    the detected regimes.  Requires network access (yfinance download).
    """
    # Get data
    ticker = "SPY"
    df = yf.download(ticker, start="2020-01-01", end="2024-01-01", interval="1d")
    print(f"=== HMM ANALYSIS: {ticker} ===\n")
    # Create and train detector
    detector = MarketRegimeDetector(n_components=2, random_state=42)
    detector.fit(df)
    # Analyze regimes
    regime_analysis = detector.analyze_regimes()
    print("📊 REGIME ANALYSIS:")
    for regime, stats in regime_analysis.items():
        print(f"\n{regime} ({stats['regime_type']}):")
        print(f" Average Return: {stats['average_return']:.4f}")
        print(f" Volatility: {stats['volatility']:.4f}")
        print(f" Duration: {stats['duration_days']} days")
        print(f" % of Time: {stats['percentage_time']:.1%}")
    # Predict current regime
    current_regime = detector.predict_current_regime(df)
    print(f"\n🎯 CURRENT REGIME:")
    print(f" State: {current_regime['current_state']}")
    print(f" Confidence: {current_regime['confidence']:.1%}")
    print(f" Probabilities: {current_regime['state_probabilities']}")
    # Generate strategy (walk-forward retraining — slow for long histories)
    strategy_signals = hmm_trading_strategy(df, detector)
    # Strategy statistics
    total_signals = strategy_signals['signal'].abs().sum()
    long_signals = (strategy_signals['signal'] == 1).sum()
    short_signals = (strategy_signals['signal'] == -1).sum()
    print(f"\n📈 STRATEGY STATISTICS:")
    print(f" Total Signals: {total_signals}")
    print(f" Long Signals: {long_signals}")
    print(f" Short Signals: {short_signals}")
    # Visualize
    detector.plot_regimes(f"HMM Regime Detection - {ticker}")
    return detector, strategy_signals
# Run the HMM demo only when this file is executed as a script.
if __name__ == "__main__":
    hmm_example_analysis()
Advanced Strategy: Multi-State HMM
class AdvancedMarketRegimeDetector:
    """
    Advanced detector with multiple hidden states for complex markets.
    """

    def __init__(self, n_components=4):
        """
        Typical 4-state interpretation:
        0: Bull Market
        1: Bear Market
        2: High Volatility (crisis)
        3: Low Volatility (consolidation)
        """
        self.n_components = n_components
        self.model = hmm.GaussianHMM(
            n_components=n_components,
            covariance_type="full",
            random_state=42
        )

    def prepare_advanced_features(self, df):
        """
        Advanced features for multi-state detection.
        """
        features = pd.DataFrame(index=df.index)
        # Returns across multiple timeframes
        features['returns_1d'] = df['Close'].pct_change()
        features['returns_5d'] = df['Close'].pct_change(5)
        features['returns_20d'] = df['Close'].pct_change(20)
        # Realized volatilities
        features['vol_5d'] = features['returns_1d'].rolling(5).std()
        features['vol_20d'] = features['returns_1d'].rolling(20).std()
        features['vol_60d'] = features['returns_1d'].rolling(60).std()
        # Momentum indicators
        features['rsi'] = self.calculate_rsi(df['Close'])
        features['macd'] = self.calculate_macd(df['Close'])
        # Volume patterns
        features['volume_trend'] = df['Volume'].rolling(20).mean() / df['Volume'].rolling(60).mean()
        features['volume_spike'] = df['Volume'] / df['Volume'].rolling(20).mean()
        # VIX proxy (volatility of volatility)
        features['vol_of_vol'] = features['vol_20d'].rolling(10).std()
        return features.dropna()

    def calculate_rsi(self, series, period=14):
        """Calculate RSI.

        BUGFIX: this method was called by prepare_advanced_features but was
        never defined on this class, so feature preparation raised
        AttributeError at runtime.
        """
        delta = series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    def calculate_macd(self, series, fast=12, slow=26, signal=9):
        """Calculate the MACD line (fast EMA minus slow EMA).

        The `signal` parameter is kept for interface compatibility; the
        signal line itself is not computed here.
        """
        ema_fast = series.ewm(span=fast).mean()
        ema_slow = series.ewm(span=slow).mean()
        macd_line = ema_fast - ema_slow
        return macd_line

    def fit_advanced(self, df):
        """Train the multi-state model and interpret each regime."""
        features = self.prepare_advanced_features(df)
        # Normalize features
        scaler = StandardScaler()
        features_scaled = scaler.fit_transform(features)
        # Train model
        self.model.fit(features_scaled)
        self.hidden_states = self.model.predict(features_scaled)
        # Interpret states
        self.regime_interpretation = self.interpret_regimes(df, features)
        return self

    def interpret_regimes(self, df, features):
        """
        Automatically label what each hidden state represents, based on its
        average return, 20-day volatility and RSI.
        """
        interpretation = {}
        for state in range(self.n_components):
            mask = self.hidden_states == state
            state_features = features[mask]
            if len(state_features) > 0:
                avg_return = state_features['returns_1d'].mean()
                avg_vol = state_features['vol_20d'].mean()
                avg_rsi = state_features['rsi'].mean()
                # Classify state based on features
                if avg_return > 0.001 and avg_vol < state_features['vol_20d'].quantile(0.5):
                    regime_type = "Bull Market"
                elif avg_return < -0.001 and avg_vol < state_features['vol_20d'].quantile(0.5):
                    regime_type = "Bear Market"
                elif avg_vol > state_features['vol_20d'].quantile(0.75):
                    regime_type = "High Volatility/Crisis"
                else:
                    regime_type = "Consolidation/Low Volatility"
                interpretation[state] = {
                    'type': regime_type,
                    'avg_return': avg_return,
                    'avg_volatility': avg_vol,
                    'avg_rsi': avg_rsi,
                    'frequency': np.mean(mask)
                }
        return interpretation
def small_cap_hmm_strategy(df, lookback_days=252):
    """
    Small cap-specific HMM strategy.

    Parameters
    ----------
    df : pd.DataFrame
        OHLCV data for the small-cap ticker.
    lookback_days : int
        NOTE(review): currently unused — the model is fitted on the full
        feature history; confirm whether a rolling window was intended.

    Returns
    -------
    dict
        Hidden states, the cleaned feature frame, its index and the detector.
    """
    # Small cap-specific parameters
    detector = MarketRegimeDetector(n_components=3)  # 3 states: bullish, bearish, volatile
    # Small cap-specific features
    features = pd.DataFrame(index=df.index)
    # Gap detection (overnight open vs previous close)
    features['gap_pct'] = (df['Open'] / df['Close'].shift(1)) - 1
    # Intraday range
    features['intraday_range'] = (df['High'] - df['Low']) / df['Open']
    # Volume spikes (crucial for small caps)
    features['volume_spike'] = df['Volume'] / df['Volume'].rolling(20).mean()
    # Price momentum
    features['momentum_5d'] = df['Close'] / df['Close'].shift(5) - 1
    # Relative strength vs market (SPY as benchmark; requires network access)
    spy_data = yf.download("SPY", start=df.index[0], end=df.index[-1])
    features['relative_strength'] = (df['Close'].pct_change() -
                                     spy_data['Close'].pct_change().reindex(df.index))
    # Fit model with small cap specific features.
    # NOTE(review): the detector's own scaler/fit pipeline is bypassed here —
    # the raw hmm model is fitted directly on locally scaled features.
    features_clean = features.dropna()
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features_clean)
    detector.model.fit(features_scaled)
    hidden_states = detector.model.predict(features_scaled)
    return {
        'states': hidden_states,
        'features': features_clean,
        'index': features_clean.index,
        'detector': detector
    }
Supervised Models for Trading
Price Prediction with XGBoost
Supervised models can predict future price movements based on historical features.
import pandas as pd
import numpy as np
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
class TradingPredictor:
    """
    Price and direction predictor using XGBoost.

    prediction_type='price' trains an XGBRegressor on the future close;
    prediction_type='direction' trains an XGBClassifier on up/down moves.
    """

    def __init__(self, prediction_type='price', target_days=1):
        """
        Parameters
        ----------
        prediction_type : str
            'price' for regression, 'direction' for classification
        target_days : int
            Days ahead to predict
        """
        self.prediction_type = prediction_type
        self.target_days = target_days
        if prediction_type == 'price':
            self.model = XGBRegressor(
                max_depth=6,
                n_estimators=100,
                learning_rate=0.1,
                random_state=42
            )
        else:
            self.model = XGBClassifier(
                max_depth=6,
                n_estimators=100,
                learning_rate=0.1,
                random_state=42
            )

    def create_features(self, df):
        """
        Create ML features from OHLCV data; NaN rows produced by the
        rolling windows and lags are dropped before returning.
        """
        features = pd.DataFrame(index=df.index)
        # Price-based features (SMAs expressed relative to the close)
        features['sma_5'] = df['Close'].rolling(5).mean() / df['Close']
        features['sma_10'] = df['Close'].rolling(10).mean() / df['Close']
        features['sma_20'] = df['Close'].rolling(20).mean() / df['Close']
        # Volatility features
        features['volatility_5'] = df['Close'].pct_change().rolling(5).std()
        features['volatility_20'] = df['Close'].pct_change().rolling(20).std()
        # Momentum features (rate of change)
        features['roc_5'] = df['Close'].pct_change(5)
        features['roc_10'] = df['Close'].pct_change(10)
        features['roc_20'] = df['Close'].pct_change(20)
        # Technical indicators
        features['rsi'] = self.calculate_rsi(df['Close'])
        features['bb_position'] = self.calculate_bb_position(df['Close'])
        # Volume features
        features['volume_sma'] = df['Volume'] / df['Volume'].rolling(20).mean()
        features['price_volume'] = df['Close'] * df['Volume']
        # OHLC features
        features['high_low_pct'] = (df['High'] - df['Low']) / df['Close']
        features['open_close_pct'] = (df['Close'] - df['Open']) / df['Open']
        # Lag features
        for lag in [1, 2, 3, 5]:
            features[f'return_lag_{lag}'] = df['Close'].pct_change().shift(lag)
            features[f'volume_lag_{lag}'] = df['Volume'].pct_change().shift(lag)
        return features.dropna()

    def calculate_rsi(self, series, period=14):
        """Calculate RSI."""
        delta = series.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    def calculate_bb_position(self, series, period=20, std_mult=2):
        """Calculate position within Bollinger Bands (0 = lower, 1 = upper)."""
        sma = series.rolling(period).mean()
        std = series.rolling(period).std()
        upper = sma + (std * std_mult)
        lower = sma - (std * std_mult)
        return (series - lower) / (upper - lower)

    def create_targets(self, df):
        """
        Create target variables shifted `target_days` into the future.
        """
        if self.prediction_type == 'price':
            # Predict the future close price
            target = df['Close'].shift(-self.target_days)
        else:
            # Predict direction (classification)
            future_return = df['Close'].pct_change(self.target_days).shift(-self.target_days)
            target = (future_return > 0).astype(int)  # 1 if up, 0 if down
        return target

    def fit(self, df, test_size=0.2):
        """
        Train the model on a chronological train/test split and store
        evaluation metrics in ``self.metrics``.
        """
        # Create features and targets
        features = self.create_features(df)
        targets = self.create_targets(df)
        # Align data
        aligned_data = pd.concat([features, targets], axis=1).dropna()
        X = aligned_data.iloc[:, :-1]  # All columns except the last one
        y = aligned_data.iloc[:, -1]  # Last column (target)
        # Temporal split (important for time series: no shuffling)
        split_point = int(len(X) * (1 - test_size))
        X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
        y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]
        # Train model
        self.model.fit(X_train, y_train)
        # Evaluate
        train_pred = self.model.predict(X_train)
        test_pred = self.model.predict(X_test)
        # Metrics
        if self.prediction_type == 'price':
            train_mse = mean_squared_error(y_train, train_pred)
            test_mse = mean_squared_error(y_test, test_pred)
            self.metrics = {
                'train_mse': train_mse,
                'test_mse': test_mse,
                'train_rmse': np.sqrt(train_mse),
                'test_rmse': np.sqrt(test_mse)
            }
        else:
            train_accuracy = (train_pred.round() == y_train).mean()
            test_accuracy = (test_pred.round() == y_test).mean()
            self.metrics = {
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy
            }
        # Save data for later analysis/plotting
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        self.train_pred, self.test_pred = train_pred, test_pred
        self.feature_names = X.columns.tolist()
        return self

    def predict_next(self, df, periods=1):
        """
        Predict the next `periods` values using the latest feature rows.
        """
        features = self.create_features(df)
        latest_features = features.iloc[-periods:].values
        if latest_features.shape[0] == 0:
            raise ValueError("Not enough data to generate features")
        predictions = self.model.predict(latest_features)
        return predictions

    def get_feature_importance(self, top_n=10):
        """
        Return the `top_n` most important features as a DataFrame
        sorted by importance, descending.
        """
        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model not trained")
        importance_df = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        return importance_df.head(top_n)

    def plot_predictions(self, title="Predictions vs Reality"):
        """
        Visualize predictions vs reality for the train and test sets.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        # Training set
        ax1.scatter(self.y_train, self.train_pred, alpha=0.6)
        ax1.plot([self.y_train.min(), self.y_train.max()],
                 [self.y_train.min(), self.y_train.max()], 'r--', lw=2)
        ax1.set_xlabel('Actual')
        ax1.set_ylabel('Predicted')
        ax1.set_title('Training Set')
        ax1.grid(True, alpha=0.3)
        # Test set
        ax2.scatter(self.y_test, self.test_pred, alpha=0.6, color='orange')
        ax2.plot([self.y_test.min(), self.y_test.max()],
                 [self.y_test.min(), self.y_test.max()], 'r--', lw=2)
        ax2.set_xlabel('Actual')
        ax2.set_ylabel('Predicted')
        ax2.set_title('Test Set')
        ax2.grid(True, alpha=0.3)
        plt.suptitle(title)
        plt.tight_layout()
        plt.show()
        return fig
# ML-based trading strategy
def ml_trading_strategy(df, prediction_threshold=0.6):
    """
    Trading strategy using ML predictions with walk-forward retraining.

    Parameters
    ----------
    df : pd.DataFrame
        Historical OHLCV data.
    prediction_threshold : float
        Minimum out-of-sample accuracy required before a signal is emitted.

    Returns
    -------
    pd.DataFrame
        Per-day price, signal (+1/-1/0) and model confidence.
    """
    # Train direction predictor
    # NOTE(review): these two full-history models are fitted but never used
    # below — the loop retrains fresh models per bar; confirm intent.
    direction_predictor = TradingPredictor(prediction_type='direction', target_days=1)
    direction_predictor.fit(df)
    # Train price predictor
    price_predictor = TradingPredictor(prediction_type='price', target_days=1)
    price_predictor.fit(df)
    # Generate signals
    signals = pd.DataFrame(index=df.index)
    signals['price'] = df['Close']
    signals['signal'] = 0
    signals['confidence'] = 0
    # Rolling window for predictions
    window_size = 252
    for i in range(window_size, len(df) - 1):
        train_data = df.iloc[i-window_size:i]
        try:
            # Train model with data up to this point only (no look-ahead)
            temp_direction = TradingPredictor(prediction_type='direction')
            temp_direction.fit(train_data, test_size=0.3)
            # Predict direction
            direction_pred = temp_direction.predict_next(train_data)[0]
            # Only generate signal if out-of-sample accuracy is high enough
            if temp_direction.metrics['test_accuracy'] > prediction_threshold:
                current_idx = df.index[i]
                if direction_pred > 0.5:  # Bullish prediction
                    signals.loc[current_idx, 'signal'] = 1
                else:  # Bearish prediction
                    signals.loc[current_idx, 'signal'] = -1
                signals.loc[current_idx, 'confidence'] = temp_direction.metrics['test_accuracy']
        except Exception as e:
            # Best-effort: bars where training/prediction fails (e.g.
            # degenerate windows) are silently skipped.
            continue
    return signals
# Complete usage example
def ml_example_analysis():
    """
    Complete ML analysis example for trading: trains direction and price
    predictors on AAPL, prints metrics, predicts the next day and runs the
    walk-forward strategy.  Requires network access (yfinance download).
    """
    # Get data
    ticker = "AAPL"
    df = yf.download(ticker, start="2020-01-01", end="2024-01-01", interval="1d")
    print(f"=== ML ANALYSIS: {ticker} ===\n")
    # Direction predictor (classification: up/down)
    print("🎯 DIRECTION PREDICTOR:")
    direction_model = TradingPredictor(prediction_type='direction', target_days=1)
    direction_model.fit(df)
    print(f" Training Accuracy: {direction_model.metrics['train_accuracy']:.1%}")
    print(f" Test Accuracy: {direction_model.metrics['test_accuracy']:.1%}")
    # Price predictor (regression)
    print(f"\n📈 PRICE PREDICTOR:")
    price_model = TradingPredictor(prediction_type='price', target_days=1)
    price_model.fit(df)
    print(f" Training RMSE: ${price_model.metrics['train_rmse']:.2f}")
    print(f" Test RMSE: ${price_model.metrics['test_rmse']:.2f}")
    # Feature importance
    print(f"\n🔍 TOP FEATURES:")
    importance = direction_model.get_feature_importance(5)
    for _, row in importance.iterrows():
        print(f" {row['feature']}: {row['importance']:.3f}")
    # Current predictions
    latest_direction = direction_model.predict_next(df, 1)[0]
    latest_price = price_model.predict_next(df, 1)[0]
    current_price = df['Close'].iloc[-1]
    print(f"\n🔮 PREDICTIONS:")
    print(f" Next Day Direction: {'Up' if latest_direction > 0.5 else 'Down'}")
    print(f" Current Price: ${current_price:.2f}")
    print(f" Predicted Price: ${latest_price:.2f}")
    print(f" Expected Change: {(latest_price/current_price - 1):.1%}")
    # Generate strategy (per-bar retraining — slow for long histories)
    strategy_signals = ml_trading_strategy(df)
    # Strategy statistics
    total_signals = strategy_signals['signal'].abs().sum()
    avg_confidence = strategy_signals[strategy_signals['confidence'] > 0]['confidence'].mean()
    print(f"\n📊 ML STRATEGY:")
    print(f" Total Signals: {total_signals}")
    print(f" Confidence Promedio: {avg_confidence:.1%}")
    # Visualize
    direction_model.plot_predictions(f"Direction Prediction - {ticker}")
    price_model.plot_predictions(f"Price Prediction - {ticker}")
    return direction_model, price_model, strategy_signals
# Run the ML demo only when this file is executed as a script.
if __name__ == "__main__":
    ml_example_analysis()
Best Practices for ML in Trading
1. Temporal Validation
def time_series_cross_validation(df, model_class, n_splits=5):
    """
    Cross-validation tailored to time series: each fold trains strictly on
    data that precedes its test window (no shuffling, no look-ahead).

    `model_class` must provide no-argument construction plus `fit(df)` and
    `evaluate(df)` methods; one score per fold is returned.
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_scores = []
    for train_idx, test_idx in splitter.split(df):
        # Fresh model per fold, fitted on the past only
        candidate = model_class()
        candidate.fit(df.iloc[train_idx])
        # Score on the chronologically later test window
        fold_scores.append(candidate.evaluate(df.iloc[test_idx]))
    return np.array(fold_scores)
2. Advanced Feature Engineering
def create_advanced_features(df, market_data=None):
    """
    Enrich an OHLCV frame with advanced ML features.

    Relies on helpers defined elsewhere in the project
    (calculate_rolling_beta, detect_*_patterns, detect_volatility_regime).
    Assumes `df` has a DatetimeIndex for the seasonal columns.
    """
    enriched = df.copy()
    # Market-relative features, only when a benchmark frame is supplied
    if market_data is not None:
        enriched['beta'] = calculate_rolling_beta(df['Close'], market_data['Close'])
        enriched['relative_strength'] = df['Close'].pct_change() - market_data['Close'].pct_change()
    # Candlestick pattern flags
    enriched['doji'] = detect_doji_patterns(df)
    enriched['hammer'] = detect_hammer_patterns(df)
    enriched['engulfing'] = detect_engulfing_patterns(df)
    # Volatility clustering regime
    enriched['vol_regime'] = detect_volatility_regime(df['Close'])
    # Calendar/seasonal features derived from the index
    enriched['day_of_week'] = df.index.dayofweek
    enriched['month'] = df.index.month
    enriched['quarter'] = df.index.quarter
    return enriched
3. Overfitting Management
class OverfittingDetector:
    """
    Heuristic overfitting detector for trading models.

    ``self.warnings`` keeps the full history of warnings across calls;
    each ``check_overfitting`` call reports on that call only.
    """

    def __init__(self):
        # Accumulated history of warnings raised by every check performed
        self.warnings = []

    def check_overfitting(self, train_score, test_score, threshold=0.1):
        """
        Detect overfitting by comparing train/test scores.

        Returns True when THIS comparison raises no warnings.
        (BUGFIX: the previous version returned ``len(self.warnings) == 0``
        over the accumulated list, so once any earlier call had warned,
        every subsequent call reported overfitting regardless of input.)
        """
        new_warnings = []
        if abs(train_score - test_score) > threshold:
            new_warnings.append("High difference between train/test scores")
        if train_score > 0.95:  # Too perfect
            new_warnings.append("Training score suspiciously high")
        # Keep the historical log, but judge only this call's findings
        self.warnings.extend(new_warnings)
        return len(new_warnings) == 0

    def suggest_fixes(self):
        """
        Suggest generic remedies for overfitting.
        """
        suggestions = [
            "Reduce model complexity (max_depth, n_estimators)",
            "Add regularization (L1/L2)",
            "Increase training data",
            "Use feature selection",
            "Implement early stopping"
        ]
        return suggestions
Specific Applications for Small Caps
1. Gap Prediction
def gap_prediction_model(df, gap_threshold=0.02):
    """
    Build features and a binary target for predicting overnight gaps
    (|open / prev_close - 1| > gap_threshold) in small caps.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Feature frame and 0/1 gap labels aligned on df's index.
    """
    feats = pd.DataFrame(index=df.index)
    # Previous-session context
    feats['prev_close_vol'] = df['Volume'].shift(1)
    feats['prev_range'] = ((df['High'] - df['Low']) / df['Close']).shift(1)
    feats['prev_return'] = df['Close'].pct_change().shift(1)
    # After-hours indicators
    feats['ah_volume'] = df['Volume'].rolling(5).mean()  # Proxy
    feats['news_sentiment'] = 0  # Placeholder for news sentiment
    # Binary label: did the open gap beyond the threshold vs prior close?
    overnight_move = df['Open'] / df['Close'].shift(1) - 1
    labels = (overnight_move.abs() > gap_threshold).astype(int)
    return feats, labels
2. Volatility Prediction
def volatility_prediction_model(df, horizon=5):
    """
    Build features and target for predicting future realized volatility.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'Close' column.
    horizon : int
        Days ahead over which the target volatility is realized.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Feature matrix and target aligned on a common index.
    """
    # GARCH-like features: lagged returns and lagged realized volatility
    returns = df['Close'].pct_change()
    features = pd.DataFrame(index=df.index)
    features['returns_lag1'] = returns.shift(1)
    features['returns_lag2'] = returns.shift(2)
    features['vol_lag1'] = returns.rolling(5).std().shift(1)
    features['vol_lag2'] = returns.rolling(10).std().shift(1)
    # Target: realized volatility over the next `horizon` days
    target = returns.rolling(horizon).std().shift(-horizon)
    # BUGFIX: dropping NaNs independently left features and target with
    # different indices (features lose head rows, target loses tail rows),
    # so rows no longer corresponded after a positional train/test split.
    # Align both on the rows where everything is defined.
    valid_idx = features.dropna().index.intersection(target.dropna().index)
    return features.loc[valid_idx], target.loc[valid_idx]
Evaluation Metrics for Trading
def evaluate_trading_model(predictions, actual_returns, transaction_cost=0.001):
    """
    Evaluate a model from a trading perspective.

    Parameters
    ----------
    predictions : array-like
        Model outputs; values > 0.5 are read as long signals, else short.
    actual_returns : array-like
        Realized per-period returns aligned with `predictions`.
    transaction_cost : float
        Proportional cost charged on every change of position.

    Returns
    -------
    dict
        Sharpe ratio, max drawdown, hit rate, total return and
        annualized volatility of the signal-following strategy.
    """
    # Convert predictions to +1/-1 trading signals
    signals = np.where(predictions > 0.5, 1, -1)
    # Strategy returns net of costs: pay on each position flip
    # (np.diff with prepend keeps alignment; the first element is 0)
    strategy_returns = signals * actual_returns - abs(np.diff(signals, prepend=signals[0])) * transaction_cost
    # Trading-specific metrics (annualized with 252 trading days)
    sharpe_ratio = np.sqrt(252) * strategy_returns.mean() / strategy_returns.std()
    # NOTE(review): confirm calculate_max_drawdown accepts the numpy array
    # produced by cumsum() here (it may expect a pandas Series).
    max_drawdown = calculate_max_drawdown(strategy_returns.cumsum())
    hit_rate = (strategy_returns > 0).mean()
    return {
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown,
        'hit_rate': hit_rate,
        'total_return': strategy_returns.sum(),
        'volatility': strategy_returns.std() * np.sqrt(252)
    }
def calculate_max_drawdown(equity_curve):
    """
    Maximum drawdown of an equity curve: the most negative peak-to-trough
    drop, expressed as a fraction of the running peak (<= 0).

    Accepts a pandas Series or a numpy array.  (BUGFIX: the previous
    version called ``.cummax()``, which only exists on pandas objects, so
    it crashed on the numpy arrays produced by evaluate_trading_model.)
    """
    equity = np.asarray(equity_curve, dtype=float)
    running_peak = np.maximum.accumulate(equity)
    drawdown = (equity - running_peak) / running_peak
    return float(drawdown.min())
Next Step
With Machine Learning mastered, let’s continue with Sentiment Analysis to incorporate alternative data into our strategies.