据我所知,并不存在一个像 np.number 那样的父类型,可以通用地匹配所有 datetime 列。
我猜你是想在 sklearn 的 Pipeline 中用它来做预处理?如果是这样,可以使用自定义的列选择器:
# Custom function to check if dtype is a datetime
def is_datetime(dtype):
    """Return True if *dtype* is a datetime dtype (timezone-naive or -aware).

    Args:
        dtype: A NumPy dtype or pandas extension dtype, e.g. an item of
            ``DataFrame.dtypes``.

    Returns:
        bool: True for ``datetime64`` dtypes of any resolution, with or
        without a timezone; False for everything else.
    """
    # Imported locally so this helper does not depend on a module-level
    # pandas import being present before it is defined.
    from pandas.api.types import is_datetime64_any_dtype

    # np.issubdtype() raises TypeError for pandas extension dtypes such as
    # categorical, and matching on the 'datetime64[ns' substring misses
    # timezone-aware dtypes stored at other resolutions. The pandas helper
    # covers all of these cases without raising.
    return bool(is_datetime64_any_dtype(dtype))
# Custom function to mimic make_column_selector with custom type checks
def custom_column_selector(df, dtype_checker):
    """Return the names of the columns of *df* whose dtype passes *dtype_checker*.

    Args:
        df: DataFrame whose ``dtypes`` are inspected.
        dtype_checker: Callable taking a single dtype and returning a truthy
            value when the column should be selected.

    Returns:
        list: Selected column labels, in the order they appear in ``df``.
    """
    selected = []
    for column_name, column_dtype in df.dtypes.items():
        if dtype_checker(column_dtype):
            selected.append(column_name)
    return selected
测试
下面在一个示例 DataFrame 上对自定义选择器做一个简单测试。该 DataFrame 包含多种数据类型的列,其中既有带时区(timezone-aware)的日期时间列,也有不带时区(timezone-naive)的日期时间列。
# Imports
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn import set_config
from sklearn.compose import make_column_selector
# Config
# Ask scikit-learn transformers to return pandas DataFrames from
# transform / fit_transform, so the transformed result keeps named columns.
set_config(transform_output="pandas")
# Example DataFrame with mixed dtypes and different datetime types
df = pd.DataFrame(
    dict(
        # Plain integers.
        a=range(10),
        # Timezone-naive datetimes: day offsets counted from 2023-01-01.
        b=pd.to_datetime(range(10), unit='D', origin=pd.Timestamp('2023-01-01')),
        # Timezone-aware (UTC) datetimes built from small integers, so every
        # value lies in 1970.
        c=pd.to_datetime(range(10), utc=True),
        # Timezone-aware datetimes localized to two different zones.
        d=pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('America/New_York'),
        e=pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('Europe/London'),
        # String labels.
        f=['cat', 'dog'] * 5,
        # Floats.
        g=[1.1, 2.2] * 5,
    )
)
# Custom function to check if dtype is a datetime
def is_datetime(dtype):
    """Return True if *dtype* is a datetime dtype (timezone-naive or -aware).

    Args:
        dtype: A NumPy dtype or pandas extension dtype, e.g. an item of
            ``DataFrame.dtypes``.

    Returns:
        bool: True for ``datetime64`` dtypes of any resolution, with or
        without a timezone; False for everything else.
    """
    # np.issubdtype() raises TypeError for pandas extension dtypes such as
    # categorical, and matching on the 'datetime64[ns' substring misses
    # timezone-aware dtypes stored at other resolutions. The pandas helper
    # covers all of these cases without raising.
    return bool(pd.api.types.is_datetime64_any_dtype(dtype))
# Define the datetime transformer, exchange with whichever you prefer
def extract_year(df):
    """Replace every datetime column of *df* with its calendar year.

    Args:
        df: DataFrame whose columns all support the ``.dt`` accessor.

    Returns:
        DataFrame with the same column labels, holding the year of each value.
    """

    def _year_of(column):
        # Each column arrives as a Series; .dt.year yields its year component.
        return column.dt.year

    return df.apply(_year_of)
# Single-step pipeline that applies extract_year to the selected columns.
# validate=False keeps the input as passed in rather than converting it,
# which extract_year needs because it relies on DataFrame.apply and ``.dt``.
datetime_transformer = Pipeline(
    steps=[
        ('year', FunctionTransformer(func=extract_year, validate=False)),
    ]
)
# Custom column selector function
def custom_column_selector(dtype_checker):
    """Build a column-selector callable driven by *dtype_checker*.

    Args:
        dtype_checker: Callable taking a single dtype and returning a truthy
            value when the column should be selected.

    Returns:
        A function that takes a DataFrame and returns the list of its column
        labels whose dtype passes ``dtype_checker``, in column order.
    """

    def selector(df):
        chosen = []
        for column_name, column_dtype in df.dtypes.items():
            if dtype_checker(column_dtype):
                chosen.append(column_name)
        return chosen

    return selector
# Create preprocessor with transformers
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns: standardize.
        (
            'num',
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
        # Object (string) columns: one-hot encode into dense output.
        (
            'cat',
            OneHotEncoder(sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
        # Datetime columns, picked by the custom dtype-based selector.
        (
            'datetime',
            datetime_transformer,
            custom_column_selector(is_datetime),
        ),
    ]
)

# Usage of the preprocessor
data_transformed = preprocessor.fit_transform(df)
预处理后的结果:

| num__a | num__g | cat__f_cat | cat__f_dog | datetime__b | datetime__c | datetime__d | datetime__e |
|---|---|---|---|---|---|---|---|
| -1.5667 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
| -1.21854 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
| -0.870388 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
| -0.522233 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
| -0.174078 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
| 0.174078 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
| 0.522233 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
| 0.870388 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
| 1.21854 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
| 1.5667 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |