💡 因子挖掘实战技巧
本章讲解了因子挖掘技术,这里分享实战中最有效的因子挖掘方法:
1. 因子有效性检验
def evaluate_factor(factor_values, forward_returns):
    """Summarise the predictive power of one factor against forward returns.

    Args:
        factor_values: pandas Series of factor values (``.corr``, ``.rank``
            and ``pd.qcut`` are applied to it).
        forward_returns: pandas Series of forward returns aligned with
            ``factor_values``.

    Returns:
        dict with the keys ``'IC'`` (Pearson correlation), ``'Rank IC'``
        (correlation of the ranks), ``'ICIR'`` (mean / std of the rolling IC),
        ``'Long-Short'`` (top bucket mean return minus bottom bucket mean
        return) and ``'Group Returns'`` (mean forward return per bucket).
    """
    # IC (information coefficient): plain Pearson correlation.
    ic = factor_values.corr(forward_returns)

    # Rank IC: correlation of the ranks, less sensitive to outliers.
    rank_ic = factor_values.rank().corr(forward_returns.rank())

    # ICIR (information ratio of the IC).
    # NOTE(review): rolling_ic is defined outside this block; it is assumed to
    # return a Series of rolling ICs -- confirm.
    ic_series = rolling_ic(factor_values, forward_returns, window=20)
    ic_std = ic_series.std()
    # A zero or NaN dispersion would otherwise yield inf/NaN noise; report NaN.
    icir = ic_series.mean() / ic_std if ic_std > 0 else float('nan')

    # Bucket the factor into (up to) 10 quantile groups; duplicates='drop'
    # may leave fewer buckets when bin edges coincide.
    quantiles = pd.qcut(factor_values, 10, labels=False, duplicates='drop')
    group_returns = forward_returns.groupby(quantiles).mean()

    # Long-short spread: highest bucket minus lowest bucket.
    if group_returns.empty:
        long_short = float('nan')
    else:
        long_short = group_returns.iloc[-1] - group_returns.iloc[0]

    return {
        'IC': ic,
        'Rank IC': rank_ic,
        'ICIR': icir,
        'Long-Short': long_short,
        'Group Returns': group_returns,
    }
2. Alpha101因子实现
class Alpha101:
    """A few factors from the WorldQuant "101 Formulaic Alphas" library.

    NOTE(review): the paper defines ``rank`` as a cross-sectional rank across
    instruments. These methods keep pandas' default rank axis (down the
    index); confirm the intended data layout before using multi-asset frames.
    """

    @staticmethod
    def alpha_001(close, returns, volume):
        """Alpha#001: rank(Ts_ArgMax(SignedPower(x, 2), 5)) - 0.5.

        ``x`` is the 20-period rolling std of ``returns`` where the return is
        negative and ``close`` otherwise.

        Args:
            close: pandas Series/DataFrame of close prices.
            returns: returns, positionally aligned with ``close``.
            volume: unused; kept for a uniform call signature.

        Returns:
            DataFrame carrying the index (and columns, if any) of ``close``.
            A Series input produces a single column labelled ``0``.
        """
        cond = returns < 0
        std_20 = returns.rolling(20).std()
        base = np.where(cond, std_20, close)
        # SignedPower keeps the sign of its argument: sign(x) * |x| ** 2.
        power = np.sign(base) * base ** 2
        # np.where returns a bare ndarray, so re-attach the original labels;
        # otherwise the result cannot be aligned back to the input data.
        frame = pd.DataFrame(
            power,
            index=close.index,
            columns=getattr(close, 'columns', None),
        )
        # Position (0..4) of the maximum inside each 5-row window.
        argmax = frame.rolling(5).apply(np.argmax, raw=True)
        return argmax.rank(pct=True) - 0.5

    @staticmethod
    def alpha_002(close, open_price, volume):
        """Alpha#002: -1 * correlation(rank(delta(log(volume), 2)),
        rank((close - open) / open), 6).

        Both inputs are ranked first and the 6-period rolling correlation is
        taken on the ranks, as in the published formula.

        Args:
            close: close prices.
            open_price: open prices.
            volume: traded volume; non-positive values are treated as missing.

        Returns:
            The negated rolling correlation, labelled like the inputs.
        """
        # log(0) would be -inf and leak inf/NaN into the differences.
        log_vol = np.log(volume.where(volume > 0))
        delta_vol = log_vol.diff(2)
        price_change = (close - open_price) / open_price
        vol_rank = delta_vol.rank(pct=True)
        ret_rank = price_change.rank(pct=True)
        corr = vol_rank.rolling(6).corr(ret_rank)
        return -1 * corr
3. 自动因子挖掘(Tsfresh)
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
def auto_extract_features(price_data, forward_returns=None):
    """Extract tsfresh features from price data and optionally filter them.

    Args:
        price_data: DataFrame whose index becomes a column after
            ``reset_index()``; that column must be named ``date`` because it
            is used as the sort column.
        forward_returns: optional target passed to tsfresh's
            ``select_features``. When omitted, the unfiltered feature matrix
            is returned.

    Returns:
        The extracted feature DataFrame, or the subset kept by
        ``select_features`` at an FDR level of 0.05 when ``forward_returns``
        is given.
    """
    df = price_data.reset_index()
    # Every row gets the same id, so the data is treated as one time series.
    df['id'] = 1
    features = extract_features(
        df,
        column_id='id',
        column_sort='date',
        default_fc_parameters=EfficientFCParameters(),
    )

    # Without a target there is nothing to test relevance against.
    if forward_returns is None:
        return features

    from tsfresh import select_features
    from tsfresh.utilities.dataframe_functions import impute

    # select_features expects a matrix without NaN/inf; impute fills in place.
    impute(features)
    # NOTE(review): with a single id the feature matrix has one row; the
    # target must be indexed like that matrix, so meaningful selection
    # presumably needs several ids or rolled windows -- confirm with callers.
    relevant_features = select_features(
        features,
        forward_returns,
        fdr_level=0.05,  # false-discovery-rate control
    )
    return relevant_features
4. 因子组合策略
class FactorCombiner:
    """Combine several factor columns into one weighted composite.

    Supported methods:
        ``'ic_weight'``: weights proportional to each factor's |IC|.
        ``'max_sharpe'``: long-only weights summing to 1 that maximise the
            mean/std ratio of the weighted factor sum.
    """

    def __init__(self, method='ic_weight'):
        self.method = method
        # Set by fit(); one weight per factor column, in column order.
        self.weights = None

    def fit(self, factors, returns):
        """Estimate the combination weights.

        Args:
            factors: DataFrame with one column per factor.
            returns: Series of returns; used by ``'ic_weight'`` only.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``self.method`` is not a supported method.
        """
        if self.method not in ('ic_weight', 'max_sharpe'):
            raise ValueError(f"unknown method: {self.method!r}")

        n_factors = len(factors.columns)
        if n_factors == 0:
            self.weights = np.array([])
            return self
        equal = np.full(n_factors, 1.0 / n_factors)

        if self.method == 'ic_weight':
            ics = np.array(
                [abs(factors[col].corr(returns)) for col in factors],
                dtype=float,
            )
            # A constant or empty factor has an undefined IC; give it no
            # weight instead of turning every weight into NaN.
            ics = np.nan_to_num(ics, nan=0.0)
            total = ics.sum()
            self.weights = ics / total if total > 0 else equal
        else:
            from scipy.optimize import minimize

            # NOTE(review): this branch treats the factor columns themselves
            # as return streams and does not use `returns` -- confirm intent.
            def neg_sharpe(w):
                port_ret = (factors * w).sum(axis=1)
                vol = port_ret.std()
                if not vol > 0:
                    return 0.0
                return -port_ret.mean() / vol

            # The ratio is scale-invariant, so pin the scale with sum(w) == 1
            # and keep the weights in [0, 1].
            result = minimize(
                neg_sharpe,
                equal,
                method='SLSQP',
                bounds=[(0.0, 1.0)] * n_factors,
                constraints={'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0},
            )
            usable = result.success and np.all(np.isfinite(result.x))
            self.weights = result.x if usable else equal
        return self

    def transform(self, factors):
        """Return the weighted sum of the factor columns, row by row.

        Weights are applied positionally, so the columns must be in the same
        order as the frame passed to ``fit``.
        """
        return (factors * self.weights).sum(axis=1)
5. 因子筛选标准
| 指标 | 优秀标准 | 说明 |
|---|---|---|
| IC | > 0.05 | 预测能力 |
| ICIR | > 0.5 | IC稳定性 |
| 多空收益 | > 5% | 盈利能力 |
| 覆盖率 | > 80% | 适用范围 |
| 换手率 | < 100% | 交易成本 |
6. 避免过拟合
def cross_validate_factor(factor_func, data, n_splits=5):
    """Score a factor on successive out-of-sample time-series folds.

    The factor is computed on each test fold and correlated with that fold's
    next-period return of the ``close`` column. The training part of each
    split is not used: ``factor_func`` is applied directly to the test rows,
    so computing it on the training rows as well would be discarded work.

    Args:
        factor_func: callable taking a slice of ``data`` and returning a
            Series of factor values.
        data: DataFrame with a ``close`` column, in chronological order.
        n_splits: number of folds passed to ``TimeSeriesSplit``.

    Returns:
        list with one IC per fold (NaN where the IC is undefined).
    """
    from sklearn.model_selection import TimeSeriesSplit

    tscv = TimeSeriesSplit(n_splits=n_splits)
    ic_scores = []
    for _, test_idx in tscv.split(data):
        test = data.iloc[test_idx]
        test_factor = factor_func(test)
        # Next-period return; the last row of each fold has no successor and
        # is NaN, which corr() ignores.
        test_returns = test['close'].pct_change().shift(-1)
        ic_scores.append(test_factor.corr(test_returns))

    # One undefined fold must not turn the summary statistics into NaN.
    valid = [ic for ic in ic_scores if not np.isnan(ic)]
    mean_ic = np.mean(valid) if valid else float('nan')
    std_ic = np.std(valid) if valid else float('nan')
    print(f"平均IC: {mean_ic:.4f}")
    print(f"IC标准差: {std_ic:.4f}")
    return ic_scores
核心建议:
- |IC| > 0.05 的因子才值得使用(IC 为负的因子可反向使用)
- 多因子组合优于单因子
- 必须做交叉验证避免过拟合
- 定期检查因子衰减情况