GROUP BY 简介
GROUP BY 语句用于将查询结果按一个或多个列进行分组,通常与聚合函数(如 COUNT、SUM、AVG、MAX、MIN)一起使用,对每个分组进行统计计算。
GROUP BY 的特点:
- 将数据按指定列分组
- 每个分组返回一行结果
- 通常与聚合函数配合使用
- 可以按多个列进行分组
- 支持 HAVING 子句过滤分组
基本语法
基本 GROUP BY 语法
SELECT column1, aggregate_function(column2)
FROM table_name
WHERE condition
GROUP BY column1;
多列分组
SELECT column1, column2, aggregate_function(column3)
FROM table_name
GROUP BY column1, column2;
带 HAVING 子句
SELECT column1, aggregate_function(column2)
FROM table_name
GROUP BY column1
HAVING aggregate_function(column2) > value;
聚合函数
函数 | 描述 | 示例 |
---|---|---|
COUNT() | 计算行数 | COUNT(*), COUNT(column) |
SUM() | 计算总和 | SUM(salary) |
AVG() | 计算平均值 | AVG(age) |
MAX() | 找最大值 | MAX(score) |
MIN() | 找最小值 | MIN(price) |
GROUP_CONCAT() | 连接字符串 | GROUP_CONCAT(name) |
实际示例
示例数据准备
-- 创建销售表
CREATE TABLE sales (
id INT PRIMARY KEY,
salesperson VARCHAR(50),
product VARCHAR(50),
region VARCHAR(50),
amount DECIMAL(10,2),
sale_date DATE
);
-- 插入测试数据
INSERT INTO sales VALUES
(1, '张三', '笔记本', '北京', 8000.00, '2023-01-15'),
(2, '李四', '手机', '上海', 3000.00, '2023-01-20'),
(3, '张三', '平板', '北京', 2500.00, '2023-02-10'),
(4, '王五', '笔记本', '广州', 8500.00, '2023-02-15'),
(5, '李四', '手机', '上海', 3200.00, '2023-03-05'),
(6, '张三', '手机', '北京', 2800.00, '2023-03-10'),
(7, '王五', '平板', '广州', 2600.00, '2023-03-20'),
(8, '赵六', '笔记本', '深圳', 9000.00, '2023-04-01');
基本分组查询
按销售员分组统计
-- 统计每个销售员的销售总额
SELECT salesperson,
COUNT(*) as order_count,
SUM(amount) as total_sales,
AVG(amount) as avg_sales
FROM sales
GROUP BY salesperson;
salesperson | order_count | total_sales | avg_sales |
---|---|---|---|
张三 | 3 | 13300.00 | 4433.33 |
李四 | 2 | 6200.00 | 3100.00 |
王五 | 2 | 11100.00 | 5550.00 |
赵六 | 1 | 9000.00 | 9000.00 |
按地区分组统计
-- 统计各地区的销售情况
SELECT region,
COUNT(*) as order_count,
SUM(amount) as total_sales,
MAX(amount) as max_sale,
MIN(amount) as min_sale
FROM sales
GROUP BY region
ORDER BY total_sales DESC;
按产品分组统计
-- 统计各产品的销售情况
SELECT product,
COUNT(*) as sales_count,
SUM(amount) as total_revenue,
AVG(amount) as avg_price
FROM sales
GROUP BY product;
多列分组
按销售员和地区分组
-- 统计每个销售员在各地区的销售情况
SELECT salesperson, region,
COUNT(*) as order_count,
SUM(amount) as total_sales
FROM sales
GROUP BY salesperson, region
ORDER BY salesperson, region;
按年月分组
-- 按月统计销售情况
SELECT YEAR(sale_date) as year,
MONTH(sale_date) as month,
COUNT(*) as order_count,
SUM(amount) as monthly_sales
FROM sales
GROUP BY YEAR(sale_date), MONTH(sale_date)
ORDER BY year, month;
按产品和地区分组
-- 统计各产品在不同地区的销售情况
SELECT product, region,
COUNT(*) as sales_count,
SUM(amount) as total_revenue,
AVG(amount) as avg_price
FROM sales
GROUP BY product, region
ORDER BY product, total_revenue DESC;
HAVING 子句
HAVING 子句用于过滤分组后的结果,类似于 WHERE 子句,但 HAVING 是在分组之后进行过滤。
基本 HAVING 用法
-- 查询销售总额大于10000的销售员
SELECT salesperson, SUM(amount) as total_sales
FROM sales
GROUP BY salesperson
HAVING SUM(amount) > 10000;
多个 HAVING 条件
-- 查询订单数量大于1且平均销售额大于4000的销售员
SELECT salesperson,
COUNT(*) as order_count,
AVG(amount) as avg_sales
FROM sales
GROUP BY salesperson
HAVING COUNT(*) > 1 AND AVG(amount) > 4000;
HAVING 与 WHERE 的区别
-- WHERE 在分组前过滤,HAVING 在分组后过滤
SELECT region, COUNT(*) as order_count, SUM(amount) as total_sales
FROM sales
WHERE amount > 3000 -- 分组前过滤:只考虑金额大于3000的订单
GROUP BY region
HAVING COUNT(*) >= 2; -- 分组后过滤:只显示订单数>=2的地区
高级分组技巧
使用 GROUP_CONCAT
-- 将每个销售员的产品列表连接起来
SELECT salesperson,
GROUP_CONCAT(product) as products,
GROUP_CONCAT(DISTINCT product) as unique_products,
COUNT(*) as order_count
FROM sales
GROUP BY salesperson;
条件聚合
-- 使用 CASE 语句进行条件聚合
SELECT region,
COUNT(*) as total_orders,
SUM(CASE WHEN amount > 5000 THEN 1 ELSE 0 END) as high_value_orders,
SUM(CASE WHEN amount > 5000 THEN amount ELSE 0 END) as high_value_sales
FROM sales
GROUP BY region;
分组排名
-- 使用窗口函数进行分组排名
SELECT salesperson, region, amount,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY amount DESC) as rank_in_region
FROM sales
ORDER BY region, rank_in_region;
分组百分比
-- 计算各地区销售额占总销售额的百分比
SELECT region,
SUM(amount) as region_sales,
ROUND(SUM(amount) * 100.0 / (SELECT SUM(amount) FROM sales), 2) as percentage
FROM sales
GROUP BY region
ORDER BY region_sales DESC;
WITH ROLLUP
WITH ROLLUP 用于在分组查询中生成小计和总计行。
单列 ROLLUP
-- 按地区分组并生成总计
SELECT region, SUM(amount) as total_sales
FROM sales
GROUP BY region WITH ROLLUP;
多列 ROLLUP
-- 按地区和产品分组并生成各级小计
SELECT region, product, SUM(amount) as total_sales
FROM sales
GROUP BY region, product WITH ROLLUP
ORDER BY region, product;
处理 ROLLUP 的 NULL 值
-- 使用 IFNULL 处理 ROLLUP 产生的 NULL 值
SELECT
IFNULL(region, '总计') as region,
IFNULL(product, '小计') as product,
SUM(amount) as total_sales
FROM sales
GROUP BY region, product WITH ROLLUP;
性能优化
GROUP BY 性能优化技巧:
- 在分组列上创建索引
- 使用覆盖索引避免回表
- 合理使用 WHERE 子句减少分组数据量
- 避免在大表上进行复杂分组
- 考虑使用分区表
索引优化示例
-- 为分组列创建索引
CREATE INDEX idx_region ON sales(region);
CREATE INDEX idx_salesperson ON sales(salesperson);
CREATE INDEX idx_region_product ON sales(region, product);
-- 覆盖索引优化
CREATE INDEX idx_cover ON sales(region, salesperson, amount);
查看执行计划
-- 分析分组查询的执行计划
EXPLAIN SELECT region, COUNT(*), SUM(amount)
FROM sales
GROUP BY region;
实际应用场景
1. 销售报表
-- 月度销售报表
SELECT
DATE_FORMAT(sale_date, '%Y-%m') as month,
COUNT(*) as order_count,
SUM(amount) as total_sales,
AVG(amount) as avg_order_value,
COUNT(DISTINCT salesperson) as active_salespeople
FROM sales
GROUP BY DATE_FORMAT(sale_date, '%Y-%m')
ORDER BY month;
2. 用户行为分析
-- 用户活跃度分析
SELECT
user_id,
COUNT(*) as login_count,
MIN(login_time) as first_login,
MAX(login_time) as last_login,
DATEDIFF(MAX(login_time), MIN(login_time)) as active_days
FROM user_logs
GROUP BY user_id
HAVING login_count >= 5
ORDER BY login_count DESC;
3. 库存统计
-- 按类别统计库存
SELECT
category,
COUNT(*) as product_count,
SUM(stock_quantity) as total_stock,
AVG(price) as avg_price,
SUM(stock_quantity * price) as total_value
FROM products
GROUP BY category
ORDER BY total_value DESC;
常见错误和注意事项
常见错误:
- SELECT 中包含非分组列且不是聚合函数
- 混淆 WHERE 和 HAVING 的使用场景
- 在 ORDER BY 中使用未分组的列
- 忽略 NULL 值对分组的影响
SQL_MODE 设置
-- 查看当前 SQL 模式
SELECT @@sql_mode;
-- 设置严格模式(推荐)
SET sql_mode = 'STRICT_TRANS_TABLES,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION';
正确的分组查询
-- 错误:SELECT 中包含非分组列
SELECT region, salesperson, SUM(amount) -- 错误
FROM sales
GROUP BY region;
-- 正确:只选择分组列和聚合函数
SELECT region, SUM(amount)
FROM sales
GROUP BY region;
-- 或者:将所有非聚合列都加入 GROUP BY
SELECT region, salesperson, SUM(amount)
FROM sales
GROUP BY region, salesperson;