SOLVED:
最快的功能:比原来的功能快995倍
def add_range_index_stack2(data, range_str):
range_str = _range_format(range_str)
df_range_index = (
data.group_by_dynamic(index_column="date", every=range_str, by="symbol")
.agg()
.with_columns(
pl.int_range(0, pl.len()).over("symbol").alias("range_index")
)
)
data = data.join_asof(df_range_index, on="date", by="symbol")
return data
OG Question:
数据逻辑:
我有一个需要分成块的时间序列.
让我们假设这个帖子需要被分成3块.我使用的数据是股票行情数据和每日价格.如果时间序列数据的长度是3个月,而‘拆分’范围是1个月,那么应该有3个数据块,每个月都标有递增的整数.因此,时间序列中应该有3个部分,所有部分都在一个数据帧中.应该有一个名为range_index
的列,它从0开始,迭代到2.例如,如果数据是1月至3月的数据,则每个价格报价都应该标记为0、1或2.1月的价格报价为1,2月份的价格报价为2.
我希望为数据框中的每个符号执行此操作.每个符号的start_date
可能不相同,因此它应该以这种方式健壮,并基于symbol
个股票数据正确地分配range_index
个值.
我做了什么:
我已经使用极逻辑构建了一个函数,可以在这个数据框上添加一列,但我认为有可能有更快的方法来实现这一点.当我用几年的数据添加几个符号时,它的执行速度会减慢到大约3秒.
我很想知道如何加快功能的建议,甚至是一种新颖的方法.我知道基于行的操作在polar中比在column中慢.如果有任何极地书呆子在那里看到一个明显的问题.救命啊!
def add_range_index(
data: pl.LazyFrame | pl.DataFrame, range_str: str
) -> pl.LazyFrame | pl.DataFrame:
"""
Context: Toolbox || Category: Helpers || Sub-Category: Mandelbrot Channel Helpers || **Command: add_n_range**.
This function is used to add a column to the dataframe that contains the
range grouping for the entire time series.
This function is used in `log_mean()`
""" # noqa: W505
range_str = _range_format(range_str)
if "date" in data.columns:
group_by_args = {
"every": range_str,
"closed": "left",
"include_boundaries": True,
}
if "symbol" in data.columns:
group_by_args["by"] = "symbol"
symbols = (data.select("symbol").unique().count())["symbol"][0]
grouped_data = (
data.lazy()
.set_sorted("date")
.group_by_dynamic("date", **group_by_args)
.agg(
pl.col("adj_close").count().alias("n_obs")
) # using 'adj_close' as the column to sum
)
range_row = grouped_data.with_columns(
pl.arange(0, pl.count()).over("symbol").alias("range_index")
)
## WIP:
# Extract the number of ranges the time series has
# Initialize a new column to store the range index
data = data.with_columns(pl.lit(None).alias("range_index"))
# Loop through each range and add the range index to the original dataframe
for row in range_row.collect().to_dicts():
symbol = row["symbol"]
start_date = row["_lower_boundary"]
end_date = row["_upper_boundary"]
range_index = row["range_index"]
# Apply the conditional logic to each group defined by the 'symbol' column
data = data.with_columns(
pl.when(
(pl.col("date") >= start_date)
& (pl.col("date") < end_date)
& (pl.col("symbol") == symbol)
)
.then(range_index)
.otherwise(pl.col("range_index"))
.over("symbol") # Apply the logic over each 'symbol' group
.alias("range_index")
)
return data
def _range_format(range_str: str) -> str:
"""
Context: Toolbox || Category: Technical || Sub-Category: Mandelbrot Channel Helpers || **Command: _range_format**.
This function formats a range string into a standard format.
The return value is to be passed to `_range_days()`.
Parameters
----------
range_str : str
The range string to format. It should contain a number followed by a
range part. The range part can be 'day', 'week', 'month', 'quarter', or
'year'. The range part can be in singular or plural form and can be
abbreviated. For example, '2 weeks', '2week', '2wks', '2wk', '2w' are
all valid.
Returns
-------
str
The formatted range string. The number is followed by an abbreviation of
the range part ('d' for day, 'w' for week, 'mo' for month, 'q' for
quarter, 'y' for year). For example, '2 weeks' is formatted as '2w'.
Raises
------
RangeFormatError
If an invalid range part is provided.
Notes
-----
This function is used in `log_mean()`
""" # noqa: W505
# Separate the number and range part
num = "".join(filter(str.isdigit, range_str))
# Find the first character after the number in the range string
range_part = next((char for char in range_str if char.isalpha()), None)
# Check if the range part is a valid abbreviation
if range_part not in {"d", "w", "m", "y", "q"}:
msg = f"`{range_str}` could not be formatted; needs to include d, w, m, y, q"
raise HumblDataError(msg)
# If the range part is "m", replace it with "mo" to represent "month"
if range_part == "m":
range_part = "mo"
# Return the formatted range string
return num + range_part
预期数据表单:
PCT股票代码也是如此.