# XGBoost with Python: quick-reference snippets

import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing.
# X (features) and y (labels) must already be defined by the caller.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Simple fit-predict (scikit-learn style estimators)
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)  # Classification
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is a deprecated alias for the same loss.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)  # Regression

# `model` has to be bound to one of the estimators before it can be fitted.
model = xg_cl  # use xg_reg instead when y is a continuous target
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Cross validation (Method 1: native xgboost API, which exposes cv / train / predict
# on a DMatrix instead of the scikit-learn style fit / predict)
from sklearn.metrics import accuracy_score  # used for the final hold-out score below

dmatrix = xgb.DMatrix(data=X_train, label=y_train)
params_clf = {"objective": "binary:logistic", "max_depth": 4}  # Classification parameters
# Regression parameters with a linear base learner; the objective must be a
# regression loss, not "binary:logistic".
params_reg = {"objective": "reg:squarederror", "booster": "gblinear"}
# Regularization parameters: "alpha" for l1, "lambda" for l2, "gamma" for the
# penalty weight for splitting on a node according to tree complexity.

# Select the parameter set that matches the task. The rest of this snippet
# (error metric, stratified folds, accuracy) is written for classification;
# for regression use params_reg with metrics="rmse" and stratified=False.
params_clf_reg = params_clf

# NOTE: early stopping needs more than `early_stopping_rounds` boosting rounds
# to be able to trigger; raise num_boost_round for it to have any effect.
cv_results = xgb.cv(dtrain=dmatrix, params=params_clf_reg, nfold=4, num_boost_round=10,
                    metrics="error", as_pandas=True, stratified=True,
                    early_stopping_rounds=10, verbose_eval=1)
accuracy_cv = 1 - cv_results['test-error-mean'].iloc[-1]

# Train the final model with the best number of boosting rounds
# (cv_results has one row per boosting round that was kept).
best_num_boost_round = len(cv_results)
final_model = xgb.train(params=params_clf_reg, dtrain=dmatrix,
                        num_boost_round=best_num_boost_round)

# Make predictions on the testing dataset
dtest = xgb.DMatrix(X_test)  # labels are not needed for prediction
y_pred_prob = final_model.predict(dtest)
# Convert probabilities to class labels with an explicit 0.5 threshold.
y_pred_binary = (y_pred_prob > 0.5).astype(int)
accuracy_final = accuracy_score(y_test, y_pred_binary)

# Cross validation (Method 2: scikit-learn utilities)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score

xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold.
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring='accuracy')

# Out-of-fold predictions: every sample is predicted by a model that never saw it.
# Labels are required here, both to stratify the folds and to fit the classifier
# in each fold, so the predictions are made on (and scored against) the training set.
y_pred_cv = cross_val_predict(xgb_model, X_train, y_train, cv=cv)
accuracy_final = accuracy_score(y_train, y_pred_cv)

# GridSearch / RandomizedSearch (HYPERPARAMETER TUNING)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# linspace pins both endpoints exactly (0.05 ... 1.0 in steps of 0.05); a float
# step in arange can drift and push the last value slightly past 1.0.
param_grid = {
    'learning_rate': np.linspace(0.05, 1.0, 20),
    'n_estimators': [200],
    'subsample': np.linspace(0.05, 1.0, 20),
}
gbm = xgb.XGBRegressor()

# Randomized search: evaluates n_iter parameter settings sampled from param_grid.
tuning_models = RandomizedSearchCV(estimator=gbm, param_distributions=param_grid, n_iter=25,
                                   scoring='neg_mean_squared_error', cv=4, verbose=1)
# Exhaustive alternative (tries every combination in param_grid):
# tuning_models = GridSearchCV(estimator=gbm, param_grid=param_grid,
#                              scoring='neg_mean_squared_error', cv=4, verbose=1)

tuning_models.fit(X, y)
print(tuning_models.best_params_)  # See the parameters that give the best results
# Visualize tree
import os

# plot_tree renders through Graphviz, so Graphviz must be installed and its
# executables reachable via PATH. Adjust this location to your installation.
graphviz_bin = 'C:/Program Files/Graphviz/bin'
if os.path.isdir(graphviz_bin):
    # Only extend PATH when the directory really exists on this machine.
    os.environ["PATH"] += os.pathsep + graphviz_bin

# Plot the first tree of a trained model. `final_model` is the booster trained
# with xgb.train earlier in this file; a fitted XGBClassifier / XGBRegressor
# can be passed instead.
xgb.plot_tree(final_model, num_trees=0)  # add rankdir="LR" to lay the tree out left to right

# pip install xgboost
from xgboost import XGBClassifier, XGBRegressor

# Use classifier or regressor according to your problem
model = XGBClassifier()
model.fit(X_train, y_train)

# Regressor with explicit hyperparameters. 'reg:squarederror' is the current
# name of the squared-error objective ('reg:linear' is a deprecated alias).
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=10)
# Booster parameters for a regression task; 'reg:squarederror' is the current
# name of the squared-error objective ('reg:linear' is a deprecated alias).
params = {"objective": "reg:squarederror", 'colsample_bytree': 0.3, 'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}

# xgb.cv works on a DMatrix; build it from the full feature matrix X and target y.
data_dmatrix = xgb.DMatrix(data=X, label=y)

# 3-fold cross validation, up to 50 boosting rounds, stopping early when the
# RMSE has not improved for 10 consecutive rounds.
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="rmse",
                    as_pandas=True, seed=123)
# Build one feature/target DataFrame per stock, keyed by stock name.
# Relies on names defined elsewhere: `data` (object providing get_data_yahoo),
# `stock_list`, `start_date` and `end_date`.
stock_data_dictionary = {}
for stock_name in stock_list:
    # Get the data
    df = data.get_data_yahoo(stock_name, start_date, end_date)

    # Calculate the daily percent change
    df['daily_pct_change'] = df['Adj Close'].pct_change()

    # Create the predictors: rolling sum and rolling standard deviation of the
    # daily percent change over windows of 10, 15, ..., 55 rows.
    predictor_list = []
    for r in range(10, 60, 5):
        pct_change_col = 'pct_change_' + str(r)
        std_col = 'std_' + str(r)
        df[pct_change_col] = df.daily_pct_change.rolling(r).sum()
        df[std_col] = df.daily_pct_change.rolling(r).std()
        predictor_list.append(pct_change_col)
        predictor_list.append(std_col)

    # Target variable: +1 when the next row's return is positive, otherwise -1.
    df['return_next_day'] = df.daily_pct_change.shift(-1)
    df['actual_signal'] = np.where(df.return_next_day > 0, 1, -1)

    # Drop rows with missing values (incomplete rolling windows at the start,
    # and the last row, which has no next-day return).
    df = df.dropna()

    # Add the data to dictionary
    stock_data_dictionary[stock_name] = df
# Python-related code snippets