您需要登录才能建立帖子与主题。

Python 拆分 CSV 文件

import os
import pandas as pd

def file_split(filename, file_num, header=True, encoding='utf8'):
    """Split a CSV file into at most ``file_num`` similarly sized CSV files.

    The input is streamed once to count its data rows, then streamed a
    second time in chunks of ``ceil(rows / file_num)`` rows. Each chunk is
    written next to the input as ``<stem>_<i><ext>``, with ``i`` starting
    at 0. The last part may be shorter than the others, and fewer than
    ``file_num`` parts are produced when the rows cannot fill them all.

    Args:
        filename: Path of the comma-separated file to split.
        file_num: Number of parts wanted; must be at least 1.
        header: True when the first line of the input holds column names;
            the names are then repeated at the top of every part. False
            treats every line as data and writes no header line.
        encoding: Text encoding used to read the input and write the parts.

    Raises:
        ValueError: If ``file_num`` is smaller than 1.

    Note:
        Rows are parsed and re-serialised by pandas, so the exact textual
        form of a value (for example leading zeros) may not be preserved.
    """
    if file_num < 1:
        raise ValueError('file_num must be a positive integer')

    # Both passes must parse the file the same way, so share the options.
    read_options = {'sep': ',', 'encoding': encoding}
    if not header:
        read_options['header'] = None

    # Pass 1: count the data rows without loading the whole file at once.
    counting_chunksize = 1000000
    total_rows = 0
    # Passing the path (not an already opened handle) lets pandas apply
    # ``encoding`` and close the file when the ``with`` block exits.
    with pd.read_csv(filename, chunksize=counting_chunksize,
                     **read_options) as reader:
        for chunk in reader:
            total_rows += len(chunk)

    # Ceiling division gives the smallest chunk size that fits all rows into
    # ``file_num`` parts; pandas requires a chunk size of at least 1.
    rows_per_file = max(1, int(-(-total_rows // file_num)))

    # Pass 2: write each chunk to its own numbered file beside the input.
    stem, extension = os.path.splitext(filename)
    with pd.read_csv(filename, chunksize=rows_per_file,
                     **read_options) as reader:
        for i, chunk in enumerate(reader):
            chunk.to_csv('{0}_{1}{2}'.format(stem, i, extension),
                         header=bool(header), index=False, encoding=encoding)
            print('保存第{0}个数据'.format(i))

if __name__ == '__main__':
    # Walk a fixed directory tree and split every file found into two parts.
    rootdir = "C:/file/wu/f"
    # os.walk yields, for each directory: its path, the names of its
    # sub-directories (without path) and the names of its files.
    for parent, _dirnames, filenames in os.walk(rootdir):
        for name in filenames:
            # Build the full path, using forward slashes throughout.
            path = os.path.join(parent, name).replace('\\', '/')

            # Report the file size in bytes before splitting it.
            size = os.path.getsize(path)
            print(size)
            file_split(path, 2, header=False)  # 2 is the number of parts