One-hot encode categorical variables and standardize numerical variables.
from sklearn.preprocessing import StandardScaler

# Numerical columns: standardized by `scaler` below.
num_vars = ['pickup_lon', 'pickup_lat', 'dropoff_lon', 'dropoff_lat', 'distance']
# Categorical columns: expanded into one-hot indicator columns.
cat_vars = ['hour', 'day', 'region']

# Fit once on the training rides so that every frame passed to
# design_matrix() is standardized with the same fitted statistics.
scaler = StandardScaler()
scaler.fit(train[num_vars])


def design_matrix(t):
    """Create a design matrix from taxi ride dataframe t.

    The numerical columns listed in ``num_vars`` are converted to standard
    units with the module-level ``scaler``; each column in ``cat_vars`` is
    one-hot encoded with its first level dropped. The pieces are
    concatenated column-wise, numerical columns first.

    Args:
        t: DataFrame containing every column named in ``num_vars`` and
            ``cat_vars``.

    Returns:
        A DataFrame indexed like ``t`` holding the scaled numerical columns
        followed by the indicator columns (named ``<column>_<level>``).
    """
    # Build the scaled frame directly from the transform output rather than
    # assigning into a copy of t: writing floats into existing (possibly
    # integer-typed) columns is an incompatible-dtype in-place set, and the
    # copy is unnecessary.
    scaled = pd.DataFrame(
        scaler.transform(t[num_vars]),  # Convert to standard units
        index=t.index,
        columns=num_vars,
    )
    # NOTE(review): get_dummies derives its columns from the levels present
    # in t, so a frame with different levels than the training data would
    # yield different columns -- confirm all callers pass frames that share
    # the same category levels.
    categoricals = [
        pd.get_dummies(t[s], prefix=s, drop_first=True) for s in cat_vars
    ]
    return pd.concat([scaled] + categoricals, axis=1)


# Notebook-style preview: first row of the training design matrix.
design_matrix(train).iloc[0, :]
Source: data100.datahub.berkeley.edu