Python structure for machine learning experiments

Here we will present a setup for a single machine to run time-consuming machine learning experiments, such as feature selection using different machine learning models.
First we will create a Python program which runs a single experiment.
We will use the argparse library to specify experiment parameters such as the target variable, the machine learning models to run and the number of hyperparameter optimisation iterations, among others.
We will use the logging module to log everything into one large text file.

The training data will be read from pickle files, as this is one of the fastest ways to load a DataFrame.
The data will be created by an external program and the resulting DataFrame pickled to disk.
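For example, the data-preparation program could end with something like this (dffeatures and the exact filename are just placeholders following the naming convention used below):

dffeatures.to_pickle('dfbt30Sf.pkl')  # read back later with pd.read_pickle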

The structure of this Python script is the following, with example variables to include:

import argparse
import logging

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

DF = pd.DataFrame  # shorthand used in the logging calls below
# getfeaturenames and runselector are the helpers from the feature selection post (not shown here)

my_parser = argparse.ArgumentParser(description='run classification models on df with features')
my_parser.add_argument('-resample','-r',metavar='resample',type=str,help='resample 1Min 30S 5Min', dest="resample", default='30S')
my_parser.add_argument('-y',metavar='y',type=str,help='y = ybs ybb ycb ycs',dest="y", default='ycb')
my_parser.add_argument('-xs',metavar='xs',type=str,help='xs = all raw q3 q95 filename ',dest="xs", default='all')
my_parser.add_argument('-xsraw',metavar='xsraw',nargs='+',help='xsraw features',dest="xsraw", default=['m','amb','nbp','dpinsell','qbp','dpin','qsp','qimb','signimb','hml','vimb1','vimb5','cimb1','cimb5','vimb1000','cimb1000','vwap1','r'])


my_parser.add_argument('-models','--m', nargs='+', help='models to run = log lin xgb nn plog plin pn',dest="models", default=['log'])
my_parser.add_argument('-optiter',metavar='o',type=int,help='n_iter in CVrandsearch for all models',dest="optiter", default=100)

my_parser.add_argument('-test','--t',dest="test", default=False,action='store_true')
my_parser.add_argument('-addlog',dest="addlog", default=False,action='store_true')
my_parser.add_argument('-interact','--i',help='True is use interact features',dest="interact", default=False,action='store_true')
my_parser.add_argument('-scale',help='True to scale via pipeline ',dest="scale", default=False,action='store_true')


my_parser.add_argument('-gap',metavar='o',type=int,help='cv gap',dest="gap", default=100)
my_parser.add_argument('-max_train_size',metavar='o',type=int,help='cv max train size',dest="max_train_size", default=2000)

args = my_parser.parse_args()
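# the post says everything is logged to one large text file; the exact logging setup
# is not shown, so this is an assumed minimal configuration (the filename is made up)
logging.basicConfig(filename='runexp.log', level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')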
logging.info(f"START - args={args}")
resample = args.resample
optiter=args.optiter
models=args.models
y = args.y
xs=args.xs
xsraw=args.xsraw
interact=args.interact
addlog=args.addlog
scale=args.scale

dffilename='dfbt'+resample+'f.pkl'  # e.g. 'dfbt30Sf.pkl' or 'dfbt30Sfandinter.pkl'
pd.set_option("display.precision", 3)

df=pd.read_pickle(dffilename).ffill()
print(f"dfcols={list(df.columns)}")
#keep only float columns,  not time
#ipdb.set_trace()
df=df.select_dtypes(include=[np.number])
print(f"only floats={list(df.columns)}")

#linear regression ys
yrs=getfeaturenames('y',df.columns)
yrs=[yr for yr in yrs if df[yr].nunique()>10]
print(f"yrs={yrs}") 

xsraw=getfeaturenames('raw',df.columns,xsraw=xsraw)
xsq3=getfeaturenames('q3',df.columns,xsraw=xsraw)
xsq95=getfeaturenames('q95',df.columns,xsraw=xsraw)


xsall=getfeaturenames('all',df.columns)
xsinteract=getfeaturenames('interact',df.columns)

xs={'all':xsall,'raw':xsraw,'q3':xsq3,'q95':xsq95}[xs]

if addlog:
    # addlog is a custom DataFrame helper (not a pandas method); it adds
    # log-transformed copies of the given columns and returns the new column names
    _,lognames=df.addlog(cols=xs,inplace=True,retfnames=True)
    xs+=lognames

if interact:
    xs.extend(xsinteract)
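
# the post does not show how the cross-validation splitter and the train/test split
# are created; a plausible sketch (an assumption, not the original code) is a
# time-series split driven by the -gap and -max_train_size arguments and a simple
# chronological hold-out for dftest
from sklearn.model_selection import TimeSeriesSplit
cv = TimeSeriesSplit(n_splits=5, gap=args.gap, max_train_size=args.max_train_size)
ntest = int(len(df) * 0.3)
dftrain, dftest = df.iloc[:-ntest], df.iloc[-ntest:]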

if 'xgbc' in models or 'pxgbc' in models:

    params={'scale_pos_weight': 100, 'n_estimators': 30, 'max_depth': 5, 'max_delta_step': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8, 'base_score': 0.1, 'alpha': 1}
    fixedparams=dict(objective ='binary:logistic')
    model=xgb.XGBClassifier(**fixedparams,**params)
    params = {
            'n_estimators':[10,100,200],
            'colsample_bytree': [ 0.8, 1.0],
            'max_depth': [5,10],
            'learning_rate':[0.01,0.1,1],
            'alpha':[1,10,100],
            'scale_pos_weight':[1,10,100],
            'base_score':[0.1,0.9],
            'max_delta_step':[0,1,10]
            }
    randcv = RandomizedSearchCV(model, param_distributions=params, n_iter=optiter, scoring='f1',
                                n_jobs=1, cv=cv, verbose=0, random_state=1).fit(dftrain[xs], dftrain[y])
    logging.info(f"rscv {model.__class__.__name__} fixedparams={fixedparams} bestscore={randcv.best_score_} bestparams={randcv.best_params_} \n{DF(randcv.cv_results_).sort_values(by='mean_test_score',ascending=False)[['mean_test_score' ,'std_test_score', 'params']]}")
    logging.debug(f"rscv \n{DF(randcv.cv_results_).sort_values(by='mean_test_score',ascending=False)}")
    xgbc=xgb.XGBClassifier(**fixedparams,**randcv.best_params_)


def getmodel(modelname):
    # model names starting with 'p' are wrapped in a Pipeline that scales the
    # features first; eval() maps the remaining name to a model object defined
    # above (e.g. 'pxgbc' -> Pipeline(StandardScaler, xgbc))
    if modelname[0]=='p':
        return Pipeline([("pipe0",StandardScaler()),("pipe1",eval(modelname[1:]))])
    else:
        return eval(modelname)

for mstr in models:
    runselector(dftrain, y=y, xs=xs, model=getmodel(mstr), nansy='.fillna(0)', nansx=None, verbose=2,
                methods=['sfsb','sfsf','rfe','abscoef'], dftest=dftest, scoring='f1', eval_metric='f1', cv=cv)

Here runselector is the feature selection function from the earlier feature selection post.

We would then run this Python file using Windows .bat files, as follows:


call python runexp.py -resample 30S -y ybb -xs raw  -models plog pxgbc
call python runexp.py -resample 30S -y ybs -xs raw  -models plog pxgbc
call python runexp.py -resample 30S -y ycb -xs raw  -models plog pxgbc
