我有一个大的数据集,我必须转换成.csv格式,由29列和1M+行组成.我认为,随着数据帧变得越来越大,向其添加任何行都会越来越耗时.我想知道是否有更快的方法,分享代码中的相关片段.
不过,任何建议都是受欢迎的.
df = DataFrame()
for startID in range(0, 100000, 1000):
s1 = time.time()
tempdf = DataFrame()
url = f'https://******/products?startId={startID}&size=1000'
r = requests.get(url, headers={'****-Token': 'xxxxxx', 'Merchant-Id': '****'})
jsonList = r.json() # datatype= list, contains= dict
normalized = json_normalize(jsonList)
# type(normal) = pandas.DataFrame
print(startID / 1000) # status indicator
for series in normalized.iterrows():
series = series[1] # iterrows returns tuple (index, series)
offers = series['offers']
series = series.drop(columns='offers')
length = len(offers)
for offer in offers:
n = json_normalize(offer).squeeze() # squeeze() casts DataFrame into Series
concatinated = concat([series, n]).to_frame().transpose()
tempdf = tempdf.append(concatinated, ignore_index=True)
del normalized
df = df.append(tempdf)
f1 = time.time()
print(f1 - s1, ' seconds')
df.to_csv('out.csv')