| 查询全部数据 |
SELECT * FROM table; |
df 或 df.head() |
Pandas DataFrame 默认显示全部(或用 .head() 查看前几行) |
| 选择特定列 |
SELECT col1, col2 FROM table; |
df[['col1', 'col2']] |
单列用 df['col'],多列需用双层方括号 |
| 条件筛选(WHERE) |
SELECT * FROM table WHERE age > 30; |
df[df['age'] > 30] 或 df.query('age > 30') |
支持布尔索引或 query 字符串表达式 |
| 多条件筛选 |
SELECT * FROM table WHERE age > 30 AND city = '北京'; |
df[(df['age'] > 30) & (df['city'] == '北京')] 或 df.query("age > 30 and city == '北京'") |
注意:Pandas 中逻辑运算符为 &、 |
| 排序(ORDER BY) |
SELECT * FROM table ORDER BY salary DESC; |
df.sort_values('salary', ascending=False) |
多列排序:sort_values(['col1','col2'], ascending=[True,False]) |
| 分组统计(GROUP BY) |
SELECT dept, AVG(salary) FROM table GROUP BY dept; |
df.groupby('dept')['salary'].mean() 或 df.groupby('dept').agg({'salary': 'mean'}) |
可对分组结果应用 sum、count、max 等聚合函数 |
| 总行数(COUNT ) |
SELECT COUNT() FROM table; |
len(df) 或 df.shape[0] |
返回 DataFrame 的行数 |
| 分组计数 |
SELECT dept, COUNT(*) FROM table GROUP BY dept; |
df.groupby('dept').size() |
size() 包含 NaN;若用 count() 则忽略 NaN |
| 去重(DISTINCT) |
SELECT DISTINCT city FROM table; |
df['city'].unique() 或 df[['city']].drop_duplicates() |
unique() 返回数组,drop_duplicates() 返回 DataFrame |
| 限制结果数量(LIMIT) |
SELECT * FROM table LIMIT 5; |
df.head(5) |
取前 N 行;tail(N) 取后 N 行 |
| 筛选空值(IS NULL) |
SELECT * FROM table WHERE name IS NULL; |
df[df['name'].isnull()] |
isnull() 与 isna() 等价 |
| 筛选非空值(IS NOT NULL) |
SELECT * FROM table WHERE name IS NOT NULL; |
df[df['name'].notnull()] |
notnull() 与 notna() 等价 |
| 字符串模糊匹配(LIKE) |
SELECT * FROM table WHERE name LIKE '%明%'; |
df[df['name'].str.contains('明')] |
需确保列是字符串类型,否则会报错 |
| 列重命名(AS) |
SELECT name AS 姓名 FROM table; |
df.rename(columns={'name': '姓名'}) |
可同时重命名多个列 |
| 新增计算列 |
SELECT *, salary * 12 AS annual FROM table; |
df['annual'] = df['salary'] * 12 |
直接赋值即可创建新列 |
| 内连接(INNER JOIN) |
SELECT * FROM A INNER JOIN B ON A.id = B.id; |
pd.merge(dfA, dfB, on='id', how='inner') |
how 参数可选 'left', 'right', 'outer' |
| 垂直拼接(UNION ALL) |
SELECT * FROM A UNION ALL SELECT * FROM B; |
pd.concat([dfA, dfB], ignore_index=True) |
如需去重(UNION),再加 .drop_duplicates() |
| 求最大值 |
SELECT MAX(salary) FROM table; |
df['salary'].max() |
同理可用 min(), sum(), mean() 等 |
| 分页查询(LIMIT OFFSET) |
SELECT * FROM table LIMIT 10 OFFSET 20; |
df.iloc[20:30] |
iloc 按位置切片,左闭右开 |
| 删除重复行 |
DELETE ...(通常先查后删) |
df.drop_duplicates() |
默认保留首次出现的重复行 |