我需要找到Polar数据帧中的列和输入值之间的相似性.我使用的是jaro_winkler_metri值.我在做的时候遇到了错误.我们不想使用UDF函数,因为它会减慢进程.
import polars as pl
import jaro
def test_polars():
name='savah'
data = {"first_name": ['sarah', 'purnima'], "last_name": ['vats', 'malik']}
df = pl.DataFrame(data)
print(df)
df = df.with_columns(
[
(pl.when( jaro.jaro_winkler_metric(pl.col("first_name"), name) >= 0.8
).then(1).otherwise(0)).alias("COMP80_FN"),
]
)
print(df)
if __name__ == '__main__':
test_polars()
C:\PythonProject\pythonProject\venv\Graph_POC\Scripts\python.exe "C:\PythonProject\pythonProject\polars data.py"
shape: (2, 2)
┌────────────┬───────────┐
│ first_name ┆ last_name │
│ --- ┆ --- │
│ str ┆ str │
╞════════════╪═══════════╡
│ sarah ┆ vats │
│ purnima ┆ malik │
└────────────┴───────────┘
Traceback (most recent call last):
File "C:\PythonProject\pythonProject\polars data.py", line 22, in <module>
test_polars()
File "C:\PythonProject\pythonProject\polars data.py", line 12, in test_polars
(pl.when( jaro.jaro_winkler_metric(pl.col("first_name"), name) >= 0.8
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\jaro\__init__.py", line 43, in jaro_winkler_metric
return jaro.metric_jaro_winkler(string1, string2)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\jaro\jaro.py", line 235, in metric_jaro_winkler
ans = string_metrics(string1, string2,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\jaro\jaro.py", line 159, in string_metrics
assert isinstance(s1, str)
AssertionError
Process finished with exit code 1