前言:电子表格自动化的重要性

Excel和CSV文件是商业环境中使用最广泛的数据格式。每天都有大量数据以电子表格形式被创建、处理和报告。使用Python自动化这些重复性工作可以节省时间并减少人为错误。

在第5篇中,我们将学习如何使用Python的csv模块、pandas和openpyxl库高效处理CSV和Excel文件。从基本的文件读写到单元格样式设置、图表创建、大文件处理,我们将介绍可以立即在实际工作中应用的技术。

1. CSV文件处理基础(csv模块)

Python内置的csv模块可以在不安装任何额外包的情况下处理CSV文件。

1.1 读取CSV文件

import csv

# 基本CSV读取
with open('data.csv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)  # 每行以列表形式返回

# 以字典形式读取(基于表头)
with open('data.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        print(row['姓名'], row['年龄'])  # 通过列名访问

# 指定分隔符
with open('data.tsv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter='\t')  # 制表符分隔
    for row in reader:
        print(row)

1.2 写入CSV文件

import csv

# 基本CSV写入
data = [
    ['姓名', '年龄', '城市'],
    ['张三', 25, '北京'],
    ['李四', 30, '上海'],
    ['王五', 28, '广州']
]

with open('output.csv', 'w', encoding='utf-8-sig', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data)  # 一次写入多行

# 以字典形式写入
data_dict = [
    {'姓名': '张三', '年龄': 25, '城市': '北京'},
    {'姓名': '李四', '年龄': 30, '城市': '上海'},
    {'姓名': '王五', '年龄': 28, '城市': '广州'}
]

with open('output_dict.csv', 'w', encoding='utf-8-sig', newline='') as file:
    fieldnames = ['姓名', '年龄', '城市']
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()  # 写入表头
    writer.writerows(data_dict)
提示:在Windows上用Excel打开CSV文件时,为防止中文乱码,请使用encoding='utf-8-sig'。添加BOM(字节顺序标记)后,Excel可以正确识别编码。

2. 使用pandas处理CSV

pandas是一个专为数据分析优化的库,为CSV文件处理提供了强大的功能。

2.1 pandas安装与基本使用

# 安装
pip install pandas
import pandas as pd

# 读取CSV
df = pd.read_csv('data.csv', encoding='utf-8')

# 查看基本信息
print(df.head())        # 前5行
print(df.info())        # 数据类型和缺失值信息
print(df.describe())    # 统计摘要

# 选择特定列
print(df['姓名'])           # 单列
print(df[['姓名', '年龄']])  # 多列

# 条件筛选
adults = df[df['年龄'] >= 20]
beijing_people = df[df['城市'] == '北京']

2.2 数据预处理与加工

import pandas as pd

df = pd.read_csv('sales_data.csv')

# 处理缺失值
df.fillna(0, inplace=True)              # 用0填充
df.dropna(subset=['必填列'], inplace=True)  # 删除特定列有缺失值的行

# 添加新列
df['总金额'] = df['单价'] * df['数量']
df['折扣价'] = df['总金额'] * 0.9

# 重命名列
df.rename(columns={'old_name': 'new_name'}, inplace=True)

# 数据排序
df.sort_values(by='总金额', ascending=False, inplace=True)

# 按组聚合
summary = df.groupby('类别').agg({
    '总金额': 'sum',
    '数量': 'mean',
    '订单号': 'count'
}).rename(columns={'订单号': '订单数'})

# 保存为CSV
df.to_csv('processed_data.csv', index=False, encoding='utf-8-sig')

2.3 合并多个CSV文件

import pandas as pd
import glob

# 合并相同格式的多个CSV文件
csv_files = glob.glob('data/*.csv')
df_list = [pd.read_csv(f) for f in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

# 合并两个数据框(JOIN)
df1 = pd.read_csv('customers.csv')
df2 = pd.read_csv('orders.csv')

# 内连接(只保留两边都有的)
merged = pd.merge(df1, df2, on='客户ID', how='inner')

# 外连接(保留所有数据)
merged_all = pd.merge(df1, df2, on='客户ID', how='outer')

3. 使用openpyxl创建/修改Excel文件

openpyxl是一个专门用于创建和修改Excel文件(.xlsx)的库。

3.1 安装与基本使用

# 安装
pip install openpyxl
from openpyxl import Workbook, load_workbook

# 创建新工作簿
wb = Workbook()
ws = wb.active
ws.title = "销售数据"

# 输入数据
ws['A1'] = '产品名称'
ws['B1'] = '数量'
ws['C1'] = '单价'

# 添加多行
data = [
    ['笔记本电脑', 10, 1200000],
    ['鼠标', 50, 25000],
    ['键盘', 30, 45000]
]
for row in data:
    ws.append(row)

# 保存
wb.save('sales.xlsx')

# 打开现有文件
wb = load_workbook('sales.xlsx')
ws = wb.active

# 读取单元格值
print(ws['A1'].value)
print(ws.cell(row=2, column=1).value)

# 读取范围
for row in ws.iter_rows(min_row=2, max_row=4, min_col=1, max_col=3):
    for cell in row:
        print(cell.value, end=' ')
    print()

3.2 单元格样式(字体、颜色、边框)

from openpyxl import Workbook
from openpyxl.styles import Font, Fill, PatternFill, Border, Side, Alignment

wb = Workbook()
ws = wb.active

# 输入数据
headers = ['产品名称', '数量', '单价', '总金额']
for col, header in enumerate(headers, 1):
    ws.cell(row=1, column=col, value=header)

# 字体样式
bold_font = Font(bold=True, size=12, color='FFFFFF')
normal_font = Font(size=11)

# 背景色
header_fill = PatternFill(start_color='4472C4', end_color='4472C4', fill_type='solid')
alt_fill = PatternFill(start_color='D9E2F3', end_color='D9E2F3', fill_type='solid')

# 边框
thin_border = Border(
    left=Side(style='thin'),
    right=Side(style='thin'),
    top=Side(style='thin'),
    bottom=Side(style='thin')
)

# 对齐
center_align = Alignment(horizontal='center', vertical='center')
right_align = Alignment(horizontal='right')

# 应用表头样式
for col in range(1, 5):
    cell = ws.cell(row=1, column=col)
    cell.font = bold_font
    cell.fill = header_fill
    cell.border = thin_border
    cell.alignment = center_align

# 数据行样式
data = [
    ['笔记本电脑', 10, 1200000],
    ['鼠标', 50, 25000],
    ['键盘', 30, 45000]
]

for row_idx, row_data in enumerate(data, 2):
    for col_idx, value in enumerate(row_data, 1):
        cell = ws.cell(row=row_idx, column=col_idx, value=value)
        cell.border = thin_border
        if row_idx % 2 == 0:
            cell.fill = alt_fill

# 调整列宽
ws.column_dimensions['A'].width = 15
ws.column_dimensions['B'].width = 10
ws.column_dimensions['C'].width = 15
ws.column_dimensions['D'].width = 15

# 调整行高
ws.row_dimensions[1].height = 25

wb.save('styled_sales.xlsx')

3.3 插入公式

from openpyxl import Workbook

wb = Workbook()
ws = wb.active

# 输入数据
ws['A1'] = '数量'
ws['B1'] = '单价'
ws['C1'] = '总金额'

data = [
    [10, 1200000],
    [50, 25000],
    [30, 45000]
]

for row_idx, (qty, price) in enumerate(data, 2):
    ws.cell(row=row_idx, column=1, value=qty)
    ws.cell(row=row_idx, column=2, value=price)
    # 插入公式(总金额 = 数量 * 单价)
    ws.cell(row=row_idx, column=3, value=f'=A{row_idx}*B{row_idx}')

# 合计公式
last_row = len(data) + 1
ws.cell(row=last_row + 1, column=1, value='合计')
ws.cell(row=last_row + 1, column=3, value=f'=SUM(C2:C{last_row})')

# 条件公式
ws['D1'] = '等级'
for row_idx in range(2, last_row + 1):
    ws.cell(row=row_idx, column=4,
            value=f'=IF(C{row_idx}>=10000000,"A",IF(C{row_idx}>=1000000,"B","C"))')

wb.save('formulas.xlsx')

4. 创建图表

使用openpyxl可以通过编程方式在Excel中创建各种图表。

from openpyxl import Workbook
from openpyxl.chart import BarChart, LineChart, PieChart, Reference

wb = Workbook()
ws = wb.active

# 输入数据
data = [
    ['月份', '销售额', '成本', '利润'],
    ['1月', 5000000, 3000000, 2000000],
    ['2月', 6000000, 3500000, 2500000],
    ['3月', 5500000, 3200000, 2300000],
    ['4月', 7000000, 4000000, 3000000],
    ['5月', 8000000, 4500000, 3500000],
    ['6月', 7500000, 4200000, 3300000]
]

for row in data:
    ws.append(row)

# 柱状图
bar_chart = BarChart()
bar_chart.type = "col"  # 垂直柱状图
bar_chart.style = 10
bar_chart.title = "月度销售情况"
bar_chart.y_axis.title = "金额(元)"
bar_chart.x_axis.title = "月份"

# 指定数据范围
data_ref = Reference(ws, min_col=2, min_row=1, max_col=4, max_row=7)
categories = Reference(ws, min_col=1, min_row=2, max_row=7)

bar_chart.add_data(data_ref, titles_from_data=True)
bar_chart.set_categories(categories)
bar_chart.shape = 4  # 图表样式

ws.add_chart(bar_chart, "F2")  # 图表位置

# 折线图
line_chart = LineChart()
line_chart.title = "月度趋势"
line_chart.style = 13
line_chart.y_axis.title = "金额"
line_chart.x_axis.title = "月份"

line_chart.add_data(data_ref, titles_from_data=True)
line_chart.set_categories(categories)

ws.add_chart(line_chart, "F18")

# 饼图(仅6月数据)
pie_chart = PieChart()
pie_chart.title = "6月成本结构"

labels = Reference(ws, min_col=2, max_col=4, min_row=1)
pie_data = Reference(ws, min_col=2, max_col=4, min_row=7)

pie_chart.add_data(pie_data, titles_from_data=False)
pie_chart.set_categories(labels)

ws.add_chart(pie_chart, "N2")

wb.save('charts.xlsx')

5. 处理多个工作表

from openpyxl import Workbook, load_workbook

# 创建包含多个工作表的新工作簿
wb = Workbook()

# 第一个工作表(默认工作表)
ws1 = wb.active
ws1.title = "摘要"

# 创建额外的工作表
ws2 = wb.create_sheet("第一季度")
ws3 = wb.create_sheet("第二季度")
ws4 = wb.create_sheet("第三季度", 0)  # 插入到最前面

# 每个工作表输入数据
quarters = {
    '第一季度': [['1月', 100], ['2月', 120], ['3月', 110]],
    '第二季度': [['4月', 130], ['5月', 140], ['6月', 135]],
    '第三季度': [['7月', 150], ['8月', 160], ['9月', 155]]
}

for sheet_name, data in quarters.items():
    ws = wb[sheet_name]
    ws.append(['月份', '销售额'])
    for row in data:
        ws.append(row)

# 在摘要工作表中引用各季度合计
summary = wb['摘要']
summary.append(['季度', '总销售额'])
summary.append(['第一季度', '=SUM(\'第一季度\'!B2:B4)'])
summary.append(['第二季度', '=SUM(\'第二季度\'!B2:B4)'])
summary.append(['第三季度', '=SUM(\'第三季度\'!B2:B4)'])

# 更改工作表顺序
wb._sheets = [wb['摘要'], wb['第一季度'], wb['第二季度'], wb['第三季度']]

# 复制工作表
source = wb['第一季度']
target = wb.copy_worksheet(source)
target.title = "第一季度_备份"

# 删除工作表
# del wb['第一季度_备份']

wb.save('multi_sheets.xlsx')

# 遍历现有文件的所有工作表
wb = load_workbook('multi_sheets.xlsx')
for sheet_name in wb.sheetnames:
    ws = wb[sheet_name]
    print(f"工作表: {sheet_name}")
    for row in ws.iter_rows(values_only=True):
        print(row)

6. 合并Excel文件

import pandas as pd
from openpyxl import load_workbook
import glob

# 使用pandas合并Excel文件
def merge_excel_files(file_pattern, output_file):
    """将多个Excel文件合并为一个"""
    excel_files = glob.glob(file_pattern)
    df_list = []

    for file in excel_files:
        df = pd.read_excel(file)
        df['源文件'] = file  # 标记来源
        df_list.append(df)

    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_excel(output_file, index=False)
    print(f"合并完成: {len(excel_files)}个文件 -> {output_file}")

merge_excel_files('reports/*.xlsx', 'combined_report.xlsx')

# 将多个工作表合并为一个工作表
def merge_sheets(input_file, output_file):
    """将一个Excel文件中的所有工作表合并为一个"""
    xl = pd.ExcelFile(input_file)
    df_list = []

    for sheet_name in xl.sheet_names:
        df = pd.read_excel(xl, sheet_name=sheet_name)
        df['工作表名'] = sheet_name
        df_list.append(df)

    combined_df = pd.concat(df_list, ignore_index=True)
    combined_df.to_excel(output_file, index=False)

merge_sheets('multi_sheets.xlsx', 'merged_single_sheet.xlsx')

# 条件合并(基于特定列)
def conditional_merge(files, key_column, output_file):
    """基于特定键列合并多个文件"""
    base_df = pd.read_excel(files[0])

    for file in files[1:]:
        df = pd.read_excel(file)
        base_df = pd.merge(base_df, df, on=key_column, how='outer')

    base_df.to_excel(output_file, index=False)

conditional_merge(['customers.xlsx', 'orders.xlsx', 'payments.xlsx'],
                  '客户ID', 'customer_full_data.xlsx')

7. 大文件处理(分块)

处理数十万行以上的大文件时,需要考虑内存效率。

import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# 分块处理大型CSV
def process_large_csv(input_file, output_file, chunk_size=10000):
    """分块处理大型CSV"""
    processed_chunks = []

    for chunk in pd.read_csv(input_file, chunksize=chunk_size):
        # 处理每个块
        chunk['已处理'] = True
        chunk['总金额'] = chunk['数量'] * chunk['单价']
        processed_chunks.append(chunk)
        print(f"已处理行数: {len(chunk)}")

    # 保存结果
    result_df = pd.concat(processed_chunks, ignore_index=True)
    result_df.to_csv(output_file, index=False)

# 分块筛选并保存(节省内存)
def filter_large_csv(input_file, output_file, condition_func, chunk_size=10000):
    """从大型CSV中仅提取符合条件的行"""
    first_chunk = True

    for chunk in pd.read_csv(input_file, chunksize=chunk_size):
        filtered = chunk[condition_func(chunk)]

        if first_chunk:
            filtered.to_csv(output_file, index=False, mode='w')
            first_chunk = False
        else:
            filtered.to_csv(output_file, index=False, mode='a', header=False)

# 使用示例:仅提取总金额100万元以上的行
filter_large_csv(
    'large_sales.csv',
    'high_value_sales.csv',
    lambda df: df['总金额'] >= 1000000
)

# 读取大型Excel文件(read_only模式)
def read_large_excel(file_path):
    """高效读取大型Excel文件"""
    wb = load_workbook(file_path, read_only=True)
    ws = wb.active

    for row in ws.iter_rows(values_only=True):
        yield row  # 作为生成器逐行返回

    wb.close()

# 写入大型Excel文件(write_only模式)
from openpyxl import Workbook

def write_large_excel(data_generator, output_file):
    """高效写入大量数据到Excel"""
    wb = Workbook(write_only=True)
    ws = wb.create_sheet()

    for row in data_generator:
        ws.append(row)

    wb.save(output_file)

8. 实战:自动生成月度报表

现在让我们综合所学内容,创建一个可以在实际工作中使用的月度销售报表自动生成系统。

import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from openpyxl.chart import BarChart, LineChart, PieChart, Reference
from openpyxl.utils.dataframe import dataframe_to_rows
from datetime import datetime
import os

class MonthlyReportGenerator:
    """月度销售报表自动生成类"""

    def __init__(self, data_file, output_dir='reports'):
        self.data_file = data_file
        self.output_dir = output_dir
        self.df = None
        self.wb = None

        # 创建输出目录
        os.makedirs(output_dir, exist_ok=True)

        # 定义样式
        self.header_font = Font(bold=True, size=11, color='FFFFFF')
        self.header_fill = PatternFill(start_color='2F5496', end_color='2F5496', fill_type='solid')
        self.border = Border(
            left=Side(style='thin'),
            right=Side(style='thin'),
            top=Side(style='thin'),
            bottom=Side(style='thin')
        )
        self.center_align = Alignment(horizontal='center', vertical='center')
        self.number_format = '#,##0'
        self.currency_format = '#,##0"元"'

    def load_data(self):
        """加载数据"""
        self.df = pd.read_csv(self.data_file, parse_dates=['订单日期'])
        self.df['月份'] = self.df['订单日期'].dt.to_period('M')
        print(f"数据加载完成: {len(self.df)}行")

    def create_summary_sheet(self, ws):
        """创建摘要工作表"""
        ws.title = "摘要"

        # 标题
        ws['A1'] = '月度销售报表'
        ws['A1'].font = Font(bold=True, size=16)
        ws.merge_cells('A1:E1')

        # 生成时间
        ws['A2'] = f'生成时间: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'
        ws['A2'].font = Font(italic=True, size=10)

        # 月度汇总数据
        monthly_summary = self.df.groupby('月份').agg({
            '订单号': 'count',
            '数量': 'sum',
            '总金额': 'sum'
        }).reset_index()
        monthly_summary.columns = ['月份', '订单数', '销售数量', '总销售额']

        # 表格起始位置
        start_row = 4
        headers = ['月份', '订单数', '销售数量', '总销售额']

        # 写入表头
        for col, header in enumerate(headers, 1):
            cell = ws.cell(row=start_row, column=col, value=header)
            cell.font = self.header_font
            cell.fill = self.header_fill
            cell.border = self.border
            cell.alignment = self.center_align

        # 写入数据
        for row_idx, row in monthly_summary.iterrows():
            ws.cell(row=start_row + row_idx + 1, column=1, value=str(row['月份']))
            ws.cell(row=start_row + row_idx + 1, column=2, value=row['订单数'])
            ws.cell(row=start_row + row_idx + 1, column=3, value=row['销售数量'])
            cell = ws.cell(row=start_row + row_idx + 1, column=4, value=row['总销售额'])
            cell.number_format = self.currency_format

            for col in range(1, 5):
                ws.cell(row=start_row + row_idx + 1, column=col).border = self.border

        # 合计行
        total_row = start_row + len(monthly_summary) + 1
        ws.cell(row=total_row, column=1, value='合计').font = Font(bold=True)
        ws.cell(row=total_row, column=2, value=f'=SUM(B{start_row+1}:B{total_row-1})')
        ws.cell(row=total_row, column=3, value=f'=SUM(C{start_row+1}:C{total_row-1})')
        ws.cell(row=total_row, column=4, value=f'=SUM(D{start_row+1}:D{total_row-1})')
        ws.cell(row=total_row, column=4).number_format = self.currency_format

        for col in range(1, 5):
            ws.cell(row=total_row, column=col).border = self.border

        # 调整列宽
        ws.column_dimensions['A'].width = 12
        ws.column_dimensions['B'].width = 12
        ws.column_dimensions['C'].width = 12
        ws.column_dimensions['D'].width = 18

        # 添加图表
        chart = BarChart()
        chart.type = "col"
        chart.style = 10
        chart.title = "月度销售情况"
        chart.y_axis.title = "销售额(元)"

        data = Reference(ws, min_col=4, min_row=start_row,
                        max_row=start_row + len(monthly_summary))
        cats = Reference(ws, min_col=1, min_row=start_row + 1,
                        max_row=start_row + len(monthly_summary))

        chart.add_data(data, titles_from_data=True)
        chart.set_categories(cats)
        chart.shape = 4

        ws.add_chart(chart, "F4")

        return monthly_summary

    def create_category_sheet(self, ws):
        """创建类别分析工作表"""
        ws.title = "按类别"

        # 类别汇总
        category_summary = self.df.groupby('类别').agg({
            '订单号': 'count',
            '数量': 'sum',
            '总金额': 'sum'
        }).reset_index()
        category_summary.columns = ['类别', '订单数', '销售数量', '总销售额']
        category_summary = category_summary.sort_values('总销售额', ascending=False)

        # 写入表格
        headers = ['类别', '订单数', '销售数量', '总销售额', '占比']
        for col, header in enumerate(headers, 1):
            cell = ws.cell(row=1, column=col, value=header)
            cell.font = self.header_font
            cell.fill = self.header_fill
            cell.border = self.border

        total_sales = category_summary['总销售额'].sum()
        for row_idx, row in category_summary.reset_index(drop=True).iterrows():
            ws.cell(row=row_idx + 2, column=1, value=row['类别'])
            ws.cell(row=row_idx + 2, column=2, value=row['订单数'])
            ws.cell(row=row_idx + 2, column=3, value=row['销售数量'])
            ws.cell(row=row_idx + 2, column=4, value=row['总销售额']).number_format = self.currency_format
            share_cell = ws.cell(row=row_idx + 2, column=5, value=row['总销售额'] / total_sales)
            share_cell.number_format = '0.0%'

            for col in range(1, 6):
                ws.cell(row=row_idx + 2, column=col).border = self.border

        # 调整列宽
        ws.column_dimensions['A'].width = 15
        ws.column_dimensions['D'].width = 18

        # 饼图
        pie = PieChart()
        pie.title = "类别销售占比"

        labels = Reference(ws, min_col=1, min_row=2, max_row=len(category_summary) + 1)
        data = Reference(ws, min_col=4, min_row=1, max_row=len(category_summary) + 1)

        pie.add_data(data, titles_from_data=True)
        pie.set_categories(labels)

        ws.add_chart(pie, "G2")

    def create_detail_sheet(self, ws):
        """创建详细数据工作表"""
        ws.title = "详细数据"

        # 将数据框写入工作表
        columns = ['订单号', '订单日期', '客户名', '产品名', '类别', '数量', '单价', '总金额']
        df_subset = self.df[columns].copy()
        df_subset['订单日期'] = df_subset['订单日期'].dt.strftime('%Y-%m-%d')

        # 表头
        for col, header in enumerate(columns, 1):
            cell = ws.cell(row=1, column=col, value=header)
            cell.font = self.header_font
            cell.fill = self.header_fill
            cell.border = self.border

        # 数据
        for row_idx, row in df_subset.iterrows():
            for col_idx, col in enumerate(columns, 1):
                cell = ws.cell(row=row_idx + 2, column=col_idx, value=row[col])
                cell.border = self.border
                if col in ['数量', '单价', '总金额']:
                    cell.number_format = self.number_format

        # 列宽
        widths = [12, 12, 12, 20, 12, 8, 12, 15]
        for col_idx, width in enumerate(widths, 1):
            ws.column_dimensions[chr(64 + col_idx)].width = width

        # 应用筛选
        ws.auto_filter.ref = f"A1:H{len(df_subset) + 1}"

    def generate_report(self):
        """生成报表主函数"""
        print("开始生成报表...")

        # 加载数据
        self.load_data()

        # 创建工作簿
        self.wb = Workbook()

        # 创建工作表
        ws_summary = self.wb.active
        self.create_summary_sheet(ws_summary)

        ws_category = self.wb.create_sheet()
        self.create_category_sheet(ws_category)

        ws_detail = self.wb.create_sheet()
        self.create_detail_sheet(ws_detail)

        # 保存文件
        today = datetime.now().strftime('%Y%m')
        output_file = os.path.join(self.output_dir, f'月度报表_{today}.xlsx')
        self.wb.save(output_file)

        print(f"报表生成完成: {output_file}")
        return output_file

# 使用示例
if __name__ == "__main__":
    # 生成示例数据(实际使用现有CSV)
    import random
    from datetime import timedelta

    # 生成示例数据
    categories = ['电子产品', '服装', '食品', '家具', '图书']
    products = {
        '电子产品': ['笔记本电脑', '智能手机', '平板电脑', '耳机'],
        '服装': ['T恤', '牛仔裤', '夹克', '大衣'],
        '食品': ['零食', '饮料', '方便面', '咖啡'],
        '家具': ['书桌', '椅子', '床', '沙发'],
        '图书': ['小说', '自助书籍', '漫画', '技术书籍']
    }

    data = []
    base_date = datetime(2025, 1, 1)

    for i in range(500):
        category = random.choice(categories)
        product = random.choice(products[category])
        qty = random.randint(1, 10)
        price = random.randint(10000, 500000)

        data.append({
            '订单号': f'ORD{i+1:05d}',
            '订单日期': base_date + timedelta(days=random.randint(0, 180)),
            '客户名': f'客户{random.randint(1, 100):03d}',
            '产品名': product,
            '类别': category,
            '数量': qty,
            '单价': price,
            '总金额': qty * price
        })

    sample_df = pd.DataFrame(data)
    sample_df.to_csv('sample_sales_data.csv', index=False, encoding='utf-8-sig')

    # 生成报表
    generator = MonthlyReportGenerator('sample_sales_data.csv')
    generator.generate_report()

总结

在本篇中,我们学习了使用Python进行Excel和CSV文件自动化的核心技术。

  • csv模块:适合简单的CSV文件读写
  • pandas:为数据分析和预处理提供强大功能
  • openpyxl:专门用于创建Excel文件、设置样式和创建图表

在实际工作中,这三者会根据情况组合使用。通常使用pandas进行简单的数据处理,最终报表则使用openpyxl进行样式美化。

下一篇第6篇将介绍邮件和通知自动化。我们将学习如何将今天创建的报表通过邮件自动发送,或通过Slack/Telegram发送通知。