我想删除带有公共子串(公共前缀)的重复项,其中公共子串是 DataFrame 中最短的那个字符串。
我尝试了下面的代码,它能按预期工作,但我想改进这段代码。在 groupby 之后,过滤逻辑是在 reformat
函数中实现的。
import re
import pandas as pd
import tabulate
def dumpdf(df):
    """Print ``df`` to stdout as a plain-format table.

    Column names are used as headers and the index is shown as the
    leading column. Returns ``None``.
    """
    rendered = tabulate.tabulate(
        df,
        headers='keys',
        showindex=True,
        tablefmt='plain',
    )
    print(rendered)
def _shortest_prefixes(packages):
    """Map each package to the shortest earlier package that is a prefix of it.

    ``packages`` must already be ordered from shortest to longest. A package
    with no earlier prefix maps to itself. Non-string entries (for example
    missing values) are passed through unchanged and never act as a prefix.
    """
    roots = []      # packages seen so far that no shorter package prefixes
    collapsed = []
    for pkg in packages:
        if not isinstance(pkg, str):
            collapsed.append(pkg)
            continue
        root = next((r for r in roots if pkg.startswith(r)), None)
        if root is None:
            roots.append(pkg)
            root = pkg
        collapsed.append(root)
    return collapsed


def reformat(df):
    """Drop, per ``name``, every row whose package extends a shorter package.

    Rows are grouped by the ``name`` column. Inside each group the rows are
    ordered by the length of ``package``; a row is dropped when its package
    starts with the package of a shorter row in the same group (or equals an
    earlier one), so only the shortest prefixes survive. Matching is a plain
    ``str.startswith`` test, so ``'com.example'`` also absorbs
    ``'com.examples'``.

    Parameters:
        df: DataFrame with at least the columns ``name`` and ``package``.
            It is not modified.

    Returns:
        A new DataFrame with the same columns and a fresh 0..n-1 index,
        groups in ``groupby('name')`` order and rows inside a group ordered
        by package length. When there is nothing to group (for example an
        empty input) an empty DataFrame with the input's columns is
        returned instead of failing.
    """
    kept_groups = []
    for _, group in df.groupby('name'):
        ordered = group.sort_values(by="package", key=lambda s: s.str.len())
        # Rewrite every package to its shortest prefix so that rows covered
        # by a shorter package become exact duplicates of that row.
        collapsed = ordered.assign(
            package=_shortest_prefixes(ordered['package'])
        )
        kept_groups.append(collapsed.drop_duplicates(['package'], keep='first'))

    if not kept_groups:
        # No groups at all: return an empty frame that keeps the schema.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(kept_groups, ignore_index=True)
def main():
    """Run the demo: build the sample table, collapse it, and print it.

    The sample rows are passed through ``reformat``; the surviving packages
    of each name are then joined into one newline-separated cell and the
    result is printed with ``dumpdf``.
    """
    rows = [
        ['A', 'com.example'],
        ['A', 'com.example.a'],
        ['A', 'com.example.b.c'],
        ['A', 'com.fun'],
        ['B', 'com.demo'],
        ['B', 'com.demo.b.c'],
        ['B', 'com.fun'],
        ['B', 'com.fun.e'],
        ['B', 'com.fun.f.g'],
    ]
    frame = pd.DataFrame(rows, columns=['name', 'package'])
    collapsed = reformat(frame)
    # One output row per name, packages separated by newlines.
    per_name = collapsed.groupby('name', as_index=False).agg('\n'.join)
    dumpdf(per_name)
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
输出:
name package
0 A com.fun
com.example
1 B com.fun
com.demo