"""XGBoost quick reference: fit/predict, cross validation, tuning, plotting.

This is a flat cheat-sheet script.  The following names are NOT defined here
and must be provided by the caller before the relevant section is run:

* ``X``, ``y``: feature matrix and target vector.
* ``data``: an object exposing ``get_data_yahoo(name, start, end)``
  (presumably ``pandas_datareader.data`` - confirm against your setup).
* ``stock_list``, ``start_date``, ``end_date``: inputs for the stock section.

Install XGBoost with ``pip install xgboost``.
"""

import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
from xgboost import XGBClassifier, XGBRegressor

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# ---------------------------------------------------------------------------
# Simple fit-predict (scikit-learn style API)
# ---------------------------------------------------------------------------
# `random_state` is the scikit-learn wrapper's name for the seed, and
# "reg:squarederror" replaces the deprecated "reg:linear" objective.
xg_cl = xgb.XGBClassifier(  # Classification
    objective="binary:logistic", n_estimators=10, random_state=123
)
xg_reg = xgb.XGBRegressor(  # Regression
    objective="reg:squarederror", n_estimators=10, random_state=123
)

# Use the classifier here; swap in `xg_reg` for a regression problem.
model = xg_cl
model.fit(X_train, y_train)
preds = model.predict(X_test)

# ---------------------------------------------------------------------------
# Cross validation, method 1: the native xgboost API.
# It exposes cv / train / predict instead of sklearn's fit / predict.
# ---------------------------------------------------------------------------
dmatrix = xgb.DMatrix(data=X_train, label=y_train)

# Classification parameters
params_clf = {"objective": "binary:logistic", "max_depth": 4}
# Regression parameters with a specified (linear) base learner
params_reg = {"objective": "reg:squarederror", "booster": "gblinear"}
# Regularization parameters: "alpha" for L1, "lambda" for L2, "gamma" for the
# penalty applied to splitting a node according to tree complexity.

# Pick the parameter set that matches the problem.  The metric ("error") and
# `stratified=True` below are classification settings; change them as well
# when switching to `params_reg`.
params_clf_reg = params_clf

cv_results = xgb.cv(
    dtrain=dmatrix,
    params=params_clf_reg,
    nfold=4,
    num_boost_round=10,
    metrics="error",
    as_pandas=True,
    stratified=True,
    early_stopping_rounds=10,
    verbose_eval=1,
)
# Cross-validated accuracy, if needed, is:
#     1 - cv_results["test-error-mean"].iloc[-1]

# Train the final model with the number of boosting rounds kept by xgb.cv.
best_num_boost_round = len(cv_results)
final_model = xgb.train(
    params=params_clf_reg,
    dtrain=dmatrix,
    num_boost_round=best_num_boost_round,
)

# Make predictions on the testing dataset (labels are not needed to predict).
dtest = xgb.DMatrix(X_test)
y_pred_prob = final_model.predict(dtest)
y_pred_binary = np.round(y_pred_prob)  # Convert probabilities to 0/1 labels
accuracy_final = accuracy_score(y_test, y_pred_binary)

# ---------------------------------------------------------------------------
# Cross validation, method 2: scikit-learn
# ---------------------------------------------------------------------------
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    xgb_model, X_train, y_train, cv=cv, scoring="accuracy"
)

# Stratified folds and the estimator's fit both need the labels, so pass
# y_test (not None).  The result is out-of-fold predictions for X_test.
y_pred_cv = cross_val_predict(xgb_model, X_test, y=y_test, cv=cv)
accuracy_final = accuracy_score(y_test, y_pred_cv)

# ---------------------------------------------------------------------------
# GridSearch / RandomizedSearch (HYPERPARAMETER TUNING)
# ---------------------------------------------------------------------------
param_grid = {
    "learning_rate": np.arange(0.05, 1.05, 0.05),
    "n_estimators": [200],
    "subsample": np.arange(0.05, 1.05, 0.05),
}
gbm = xgb.XGBRegressor()

# Randomized search samples `n_iter` candidates from `param_distributions`.
# For an exhaustive search use instead:
#     GridSearchCV(estimator=gbm, param_grid=param_grid,
#                  scoring="neg_mean_squared_error", cv=4, verbose=1)
tuning_models = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=param_grid,
    n_iter=25,
    scoring="neg_mean_squared_error",
    cv=4,
    verbose=1,
)
tuning_models.fit(X, y)
best_params = tuning_models.best_params_  # Parameters giving the best score

# ---------------------------------------------------------------------------
# Visualize a tree
# ---------------------------------------------------------------------------
# MAKE SURE TO INSTALL GRAPHVIZ AND ADD THE INSTALLATION PATH
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin'
# Pass rankdir="LR" to lay the tree out sideways, from left to right.
xgb.plot_tree(final_model, num_trees=0)

# ---------------------------------------------------------------------------
# Second example: default classifier and a regularized regressor
# ---------------------------------------------------------------------------
# Use the classifier or the regressor according to your problem.
model = XGBClassifier()
model.fit(X_train, y_train)

xg_reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,  # L1 regularization (native-API name: "alpha")
    n_estimators=10,
)

params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}
# The native cv API works on a DMatrix built from the full dataset.
data_dmatrix = xgb.DMatrix(data=X, label=y)
cv_results = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

# ---------------------------------------------------------------------------
# Feature engineering for a list of stocks
# ---------------------------------------------------------------------------
# Predictor column names are the same for every stock, so build them once.
# Order matches column creation below: pct_change_N, std_N for each window.
rolling_windows = range(10, 60, 5)
predictor_list = []
for r in rolling_windows:
    predictor_list.extend([f"pct_change_{r}", f"std_{r}"])

# Placeholder to store the prepared data of each stock
stock_data_dictionary = {}

for stock_name in stock_list:
    # Get the data
    df = data.get_data_yahoo(stock_name, start_date, end_date)

    # Calculate the daily percent change
    df["daily_pct_change"] = df["Adj Close"].pct_change()

    # Create the predictors: rolling sum and rolling std of daily returns
    for r in rolling_windows:
        rolling_returns = df.daily_pct_change.rolling(r)
        df[f"pct_change_{r}"] = rolling_returns.sum()
        df[f"std_{r}"] = rolling_returns.std()

    # Target variable: +1 if the next day's return is positive, else -1
    df["return_next_day"] = df.daily_pct_change.shift(-1)
    df["actual_signal"] = np.where(df.return_next_day > 0, 1, -1)

    # Drop warm-up rows of the rolling windows and the last row, which has
    # no next-day return.
    df = df.dropna()

    # Add the data to the dictionary
    stock_data_dictionary[stock_name] = df