Last updated
Why Split Large CSV Files?
Large CSV files (millions of rows) can be slow to open in Excel, exceed API upload limits, or overwhelm database import tools. Splitting them into smaller chunks makes processing parallel and manageable. Common split strategies: by row count, by file size, or by a column value.
Splitting by Row Count
Python
import csv, os
def split_csv(input_file, rows_per_chunk=10000, output_dir='.'):
os.makedirs(output_dir, exist_ok=True)
base = os.path.splitext(os.path.basename(input_file))[0]
with open(input_file, newline='', encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader)
chunk_num = 1
rows = []
for row in reader:
rows.append(row)
if len(rows) >= rows_per_chunk:
write_chunk(header, rows, output_dir, base, chunk_num)
chunk_num += 1
rows = []
if rows: # write remaining rows
write_chunk(header, rows, output_dir, base, chunk_num)
def write_chunk(header, rows, output_dir, base, num):
path = os.path.join(output_dir, f"{base}_part{num:03d}.csv")
with open(path, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(header)
writer.writerows(rows)
print(f"Written: {path} ({len(rows)} rows)")
split_csv('large_dataset.csv', rows_per_chunk=50000)
Splitting by Column Value
Python
import pandas as pd
df = pd.read_csv('sales.csv')
# Split by year
for year, group in df.groupby(df['date'].str[:4]):
group.to_csv(f'sales_{year}.csv', index=False)
print(f"Written sales_{year}.csv: {len(group)} rows")