Python 数据处理技巧：pandas 实战笔记

处理了几百个 Excel 文件，从每天加班到半小时搞定。pandas 用好了真能救命。

先说说背景#

公司财务每个月都要汇总各部门的报表，几十个 Excel，格式还不统一。原来都是手工复制粘贴，一个人要干三天。

我用 pandas 写了个自动化脚本，现在半小时跑完，还能自动检查错误。

基础读取#

读取单个文件#

import pandas as pd

# Simplest form: read the workbook with default settings.
df = pd.read_excel('data.xlsx')

# Pick a worksheet by name.
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Skip leading rows (some reports put a title block above the header row).
df = pd.read_excel('data.xlsx', skiprows=3)

# Select columns by Excel column letter. Letters must be passed as ONE
# comma-separated string (ranges such as 'A:C' also work). A list of strings
# like ['A', 'C', 'D'] is matched against header names instead, and raises
# when the sheet has no headers with those names.
df = pd.read_excel('data.xlsx', usecols='A,C,D')

批量读取#

import glob
import pandas as pd

# Collect every .xlsx under data/. glob returns paths in arbitrary order,
# so sort them to keep the merged row order reproducible between runs.
files = sorted(glob.glob('data/*.xlsx'))

# Read each workbook and tag its rows with the file they came from.
dfs = [
    pd.read_excel(path).assign(**{'来源文件': path})
    for path in files
]

# Stack all frames into one table with a fresh 0..n-1 index.
# NOTE: pd.concat raises ValueError when no file matched (empty list).
result = pd.concat(dfs, ignore_index=True)

数据清洗#

处理缺失值#

# Count missing values per column.
print(df.isnull().sum())

# Drop every row that has at least one missing value.
df_clean = df.dropna()

# Fill missing values with a constant. Assign the result back instead of
# calling fillna(..., inplace=True) on df[...]: the in-place form operates on
# an intermediate object, emits a FutureWarning in pandas 2.2 and no longer
# updates df once Copy-on-Write is enabled.
df['金额'] = df['金额'].fillna(0)
df['备注'] = df['备注'].fillna('无')

# Alternative to the constant fill above: use the column mean.
# (If the 0-fill already ran, there is nothing left for this line to fill.)
df['金额'] = df['金额'].fillna(df['金额'].mean())

去重#

# How many rows are exact duplicates of an earlier row?
duplicate_count = df.duplicated().sum()
print(duplicate_count)

# Remove rows that are identical in every column.
df = df.drop_duplicates()

# Remove rows sharing the same key columns, keeping the first occurrence.
dedupe_keys = ['姓名', '日期']
df = df.drop_duplicates(subset=dedupe_keys, keep='first')

类型转换#

# Text -> number; values that cannot be parsed become NaN instead of raising.
amount_raw = df['金额']
df['金额'] = pd.to_numeric(amount_raw, errors='coerce')

# Text -> datetime, parsed with an explicit year-month-day format.
date_raw = df['日期']
df['日期'] = pd.to_datetime(date_raw, format='%Y-%m-%d')

# Cast the identifier column to string.
id_raw = df['编号']
df['编号'] = id_raw.astype(str)

errors='coerce' 很重要：无法转换的值会变成 NaN，而不是抛出异常中断整个处理。转换之后记得检查一下 NaN 的数量，避免脏数据被悄悄吞掉。

数据处理#

筛选数据#

# Single condition: build a boolean mask, then select the matching rows.
is_large = df['金额'] > 1000
df_large = df.loc[is_large]

# Several conditions: combine masks with & (element-wise AND).
from_2024 = df['日期'] >= '2024-01-01'
df_filtered = df.loc[is_large & from_2024]

# Membership test against a list of values.
target_depts = ['销售部', '市场部']
df_depts = df.loc[df['部门'].isin(target_depts)]

# Substring match; na=False makes missing names count as "no match".
name_matches = df['姓名'].str.contains('张', na=False)
df_names = df.loc[name_matches]

分组统计#

# Total amount per department.
amount_by_dept = df.groupby('部门')['金额']
summary = amount_by_dept.sum()

# Several statistics per (department, month) pair.
agg_spec = {
    '金额': ['sum', 'mean', 'count'],
    '数量': 'sum',
}
by_dept_month = df.groupby(['部门', '月份'])
summary = by_dept_month.agg(agg_spec)

# Turn the group keys back into ordinary columns.
summary = summary.reset_index()

数据透视表#

# Excel-style pivot table: departments as rows, months as columns,
# summed amounts in the cells, and 0 where a combination has no data.
pivot = df.pivot_table(
    values='金额',
    index='部门',
    columns='月份',
    aggfunc='sum',
    fill_value=0,
)

合并数据#

# SQL-style join on a single key column.
# `how` accepts 'left', 'right', 'inner' or 'outer'.
df_merged = df_left.merge(df_right, on='员工ID', how='left')

# Join on several key columns at once.
join_keys = ['部门', '月份']
df_merged = df_left.merge(df_right, on=join_keys, how='inner')

实用技巧#

添加计算列#

# Plain column arithmetic.
df['总额'] = df['单价'] * df['数量']


# Value-dependent column computed element by element.
def _discount_rate(amount):
    """Return 0.9 for amounts above 1000, otherwise 1.0."""
    if amount > 1000:
        return 0.9
    return 1.0


df['折扣'] = df['金额'].apply(_discount_rate)


# Row-wise logic that reads better as a named function.
def calc_bonus(row):
    """Return the bonus for one row, derived from its '业绩' value.

    10% of the value at 100000 or more, 5% at 50000 or more, otherwise 0.
    """
    performance = row['业绩']
    if performance >= 100000:
        return performance * 0.1
    if performance >= 50000:
        return performance * 0.05
    return 0


# axis=1 passes each row (not each column) to calc_bonus.
df['奖金'] = df.apply(calc_bonus, axis=1)

数据校验#

# Range check: rows with a negative amount are reported and exported.
errors = df[df['金额'] < 0]
if not errors.empty:
    print(f"发现 {len(errors)} 条金额异常数据")
    errors.to_excel('errors.xlsx', index=False)

# Required fields: report how many rows are missing each one.
required_cols = ['姓名', '部门', '金额']
for col in required_cols:
    missing_count = int(df[col].isnull().sum())
    if missing_count:
        print(f"{col} 有 {missing_count} 条缺失")

# Consistency check between the detail total and the stated total.
# Compare with a tolerance rather than ==: amounts read from Excel are
# floats, so sums that are equal on paper can differ in the last bits.
# Half a cent is used as the threshold (assumes both columns are numeric
# amounts with two decimals -- adjust if the data uses another precision).
# Rows where either side is NaN still fail the check, as they did with ==.
difference = (df['明细合计'] - df['总金额']).abs()
df['校验'] = difference < 0.005
wrong = df[~df['校验']]

格式化输出#

# Display options used when printing frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Export to Excel, then size each column to fit its content.
with pd.ExcelWriter('output.xlsx', engine='openpyxl') as writer:
    df.to_excel(writer, sheet_name='数据', index=False)

    # The openpyxl worksheet that to_excel just created.
    worksheet = writer.sheets['数据']

    for column in worksheet.columns:
        column_letter = column[0].column_letter
        # Longest rendered value in the column. Empty cells are skipped so
        # they do not count as the 4-character string "None". No try/except
        # is needed: str() and len() do not fail on cell values.
        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        # Add a little padding and cap the width at 50.
        adjusted_width = min(max_length + 2, 50)
        worksheet.column_dimensions[column_letter].width = adjusted_width

完整案例#

月度报表汇总#

import pandas as pd
import glob
from datetime import datetime


def process_monthly_report(pattern='reports/2024-*-*.xlsx',
                           output_path='monthly_summary.xlsx'):
    """Merge the monthly department workbooks into one summary workbook.

    Args:
        pattern: Glob pattern selecting the workbooks to read.
        output_path: Path of the summary workbook to write.

    Returns:
        A ``(combined, summary)`` tuple holding the cleaned detail rows and
        the per-department statistics, or ``None`` when no workbook could be
        read.

    Side effects:
        Writes ``output_path``; also writes ``negative_amounts.xlsx`` when
        rows with a negative amount are found. Progress is printed.
    """
    # 1. Locate the input files. glob order is arbitrary, so sort the paths
    #    to keep the merged row order reproducible.
    files = sorted(glob.glob(pattern))
    print(f"找到 {len(files)} 个文件")

    # One timestamp for the whole run, so every row of a run shares it.
    imported_at = datetime.now()
    all_data = []
    error_files = []

    for file in files:
        try:
            df = pd.read_excel(file)
        except Exception as e:
            # Deliberate best effort: one unreadable workbook must not stop
            # the run. The failure is recorded and reported below.
            error_files.append({'file': file, 'error': str(e)})
            continue

        # Normalize header names (some files carry stray whitespace). Only
        # strings are stripped; other labels are kept as they are, whereas
        # Index.str.strip() would turn them into NaN in a mixed header.
        df.columns = [
            name.strip() if isinstance(name, str) else name
            for name in df.columns
        ]

        # Provenance columns.
        df['来源文件'] = file
        df['导入时间'] = imported_at

        all_data.append(df)

    if error_files:
        print(f"读取失败 {len(error_files)} 个文件")

    # 2. Merge.
    if not all_data:
        # No workbook is written in this case, so print the read errors
        # here; otherwise they would be lost.
        for item in error_files:
            print(f"  {item['file']}: {item['error']}")
        print("没有成功读取的文件")
        return None

    combined = pd.concat(all_data, ignore_index=True)
    print(f"合并完成，共 {len(combined)} 条记录")

    # 3. Clean. Unparseable amounts become NaN and are dropped together with
    #    rows whose amount was empty to begin with.
    combined['金额'] = pd.to_numeric(combined['金额'], errors='coerce')
    combined = combined.dropna(subset=['金额'])

    # Unparseable dates become NaT; those rows are kept.
    combined['日期'] = pd.to_datetime(combined['日期'], errors='coerce')

    # 4. Validate: negative amounts are reported and exported for review.
    negative = combined[combined['金额'] < 0]
    if not negative.empty:
        print(f"警告：发现 {len(negative)} 条负数金额")
        negative.to_excel('negative_amounts.xlsx', index=False)

    # 5. Per-department statistics.
    summary = combined.groupby('部门').agg({
        '金额': ['sum', 'mean', 'count']
    }).round(2)

    # 6. Export detail rows, summary and (if any) the list of failed files.
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        combined.to_excel(writer, sheet_name='明细数据', index=False)
        summary.to_excel(writer, sheet_name='部门汇总')

        if error_files:
            pd.DataFrame(error_files).to_excel(writer, sheet_name='错误文件', index=False)

    print(f"处理完成，结果已导出到 {output_path}")
    return combined, summary


# Run as a script.
if __name__ == '__main__':
    process_monthly_report()

最后说两句#

pandas 功能很强大，但也容易把简单问题复杂化。

我的建议：

- 先想清楚要做什么，再写代码
- 小步测试，不要一次性处理太多数据
- 保存中间结果，方便排查问题
- 做好异常处理，真实数据总是乱七八糟的

还有，Excel 能做的事情不要硬上 pandas。有时候手动处理更快。

参考链接#

2026-03-09 更新。pandas 版本 2.2.x。