重复数据概述
重复数据是数据库中常见的问题,可能由数据导入错误、应用程序逻辑缺陷或用户操作失误导致。处理重复数据是数据清洗和维护的重要环节。
🔍 识别重复数据
使用 GROUP BY 和 HAVING 子句查找重复记录
🛡️ 防止重复数据
通过约束和索引预防重复数据产生
🗑️ 删除重复数据
安全删除重复记录,保留唯一数据
🔧 数据清洗
批量处理和自动化重复数据清理
准备测试环境
-- Create a throwaway database for this demo.
CREATE DATABASE duplicate_demo;
USE duplicate_demo;
-- Users table for the exercises. It deliberately has NO unique
-- constraints (beyond the surrogate PK), so duplicate rows can be
-- inserted for the detection/cleanup examples below.
CREATE TABLE users (
id INT AUTO_INCREMENT PRIMARY KEY,
username VARCHAR(50),
email VARCHAR(100),
phone VARCHAR(20),
city VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Seed data. Several rows intentionally collide on different column
-- combinations so each detection query below finds something.
INSERT INTO users (username, email, phone, city) VALUES
('alice', 'alice@example.com', '13800138001', 'Beijing'),
('bob', 'bob@example.com', '13800138002', 'Shanghai'),
('alice', 'alice@example.com', '13800138001', 'Beijing'), -- exact duplicate of row 1
('charlie', 'charlie@example.com', '13800138003', 'Guangzhou'),
('bob', 'bob@gmail.com', '13800138002', 'Shanghai'), -- partial duplicate (username + phone)
('diana', 'diana@example.com', '13800138004', 'Shenzhen'),
('alice', 'alice@hotmail.com', '13800138005', 'Beijing'), -- duplicate username only
('eve', 'eve@example.com', '13800138006', 'Hangzhou'),
('charlie', 'charlie@example.com', '13800138007', 'Guangzhou'), -- duplicate email
('frank', 'frank@example.com', '13800138002', 'Tianjin'); -- duplicate phone
-- Orders table, used to demonstrate duplicate handling across related
-- tables. The foreign key is given an explicit name so that constraint
-- violations and future migrations are greppable; the referential
-- actions are spelled out (RESTRICT is the MySQL default, stated here
-- for clarity — deleting a referenced user will fail).
CREATE TABLE orders (
    id INT AUTO_INCREMENT PRIMARY KEY,
    user_id INT,
    order_number VARCHAR(50),
    product_name VARCHAR(100),
    quantity INT,
    price DECIMAL(10,2),
    order_date DATE,
    CONSTRAINT orders_user_id_fk FOREIGN KEY (user_id) REFERENCES users(id)
        ON DELETE RESTRICT ON UPDATE RESTRICT
);
-- Seed order data; ORD001 is inserted three times to simulate a
-- duplicated business transaction.
INSERT INTO orders (user_id, order_number, product_name, quantity, price, order_date) VALUES
(1, 'ORD001', 'iPhone 15', 1, 7999.00, '2024-01-15'),
(2, 'ORD002', 'MacBook Pro', 1, 15999.00, '2024-01-16'),
(1, 'ORD001', 'iPhone 15', 1, 7999.00, '2024-01-15'), -- duplicate order
(3, 'ORD003', 'iPad Air', 1, 4999.00, '2024-01-17'),
(2, 'ORD004', 'AirPods Pro', 2, 1899.00, '2024-01-18'),
(1, 'ORD001', 'iPhone 15', 1, 7999.00, '2024-01-15'); -- duplicated again
SELECT 'Test data created successfully' AS message;
识别重复数据
基本重复检测
-- 1. Rows duplicated across every non-key column.
SELECT
    username,
    email,
    phone,
    city,
    COUNT(*) AS duplicate_count
FROM users
GROUP BY username, email, phone, city
HAVING COUNT(*) >= 2;

-- 2. Duplicates on individual columns.
-- Usernames shared by more than one row
SELECT
    username,
    COUNT(*) AS count
FROM users
GROUP BY username
HAVING COUNT(*) >= 2;

-- Emails shared by more than one row
SELECT
    email,
    COUNT(*) AS count
FROM users
GROUP BY email
HAVING COUNT(*) >= 2;

-- Phone numbers shared by more than one row
SELECT
    phone,
    COUNT(*) AS count
FROM users
GROUP BY phone
HAVING COUNT(*) >= 2;
-- 3. Full row detail for every record that belongs to a duplicate group:
-- derive the duplicated (username, email, phone, city) keys first, then
-- join back to fetch each member row.
SELECT u.*
FROM users AS u
INNER JOIN (
    SELECT username, email, phone, city
    FROM users
    GROUP BY username, email, phone, city
    HAVING COUNT(*) >= 2
) AS dup
    ON dup.username = u.username
    AND dup.email = u.email
    AND dup.phone = u.phone
    AND dup.city = u.city
ORDER BY u.username, u.id;
-- 4. Window-function duplicate detection.
-- FIX: the original filtered with `HAVING row_num > 1`, but window
-- functions are evaluated AFTER HAVING, so referencing a window alias
-- there is an error in MySQL. The ranking must be computed in a derived
-- table and filtered in the outer WHERE.
SELECT
    id,
    username,
    email,
    phone,
    city,
    row_num
FROM (
    SELECT
        id,
        username,
        email,
        phone,
        city,
        ROW_NUMBER() OVER (
            PARTITION BY username, email, phone, city
            ORDER BY id
        ) AS row_num
    FROM users
) AS ranked
-- row_num = 1 is the row we would keep; anything above 1 is a duplicate.
WHERE row_num > 1;
高级重复检测
-- 1. Fuzzy duplicate detection (similar but not identical values).
-- Find records whose usernames sound alike or are within edit distance 2.
-- NOTE(review): LEVENSHTEIN is a user-defined function created further
-- down in this file — running the script top-to-bottom will fail here
-- until that CREATE FUNCTION has been executed. Consider reordering.
-- NOTE(review): SOUNDEX is designed for English-like words; verify it is
-- meaningful for the usernames stored here.
SELECT u1.id, u1.username, u2.id, u2.username
FROM users u1
JOIN users u2 ON u1.id < u2.id -- id ordering avoids symmetric pairs / self-pairs
WHERE SOUNDEX(u1.username) = SOUNDEX(u2.username)
OR LEVENSHTEIN(u1.username, u2.username) <= 2;
-- 2. Pairwise duplicate detection across several candidate keys.
-- The CASE branches are ordered from strongest to weakest match, so each
-- pair is labelled with the most specific match type that applies.
SELECT
    a.*,
    b.id AS duplicate_id,
    CASE
        WHEN a.username = b.username AND a.email = b.email THEN 'Exact Match'
        WHEN a.username = b.username THEN 'Username Match'
        WHEN a.email = b.email THEN 'Email Match'
        WHEN a.phone = b.phone THEN 'Phone Match'
    END AS match_type
FROM users AS a
INNER JOIN users AS b
    ON a.id < b.id
WHERE a.username = b.username
   OR a.email = b.email
   OR a.phone = b.phone
ORDER BY a.id;
-- 3. Duplicate-data summary report.
-- FIX: the original used CONCAT(username, email, phone, city), which
--   (a) returns NULL as soon as ANY column is NULL (phone/city are
--       nullable in the DDL), and COUNT(DISTINCT ...) ignores NULL —
--       so rows with a NULL column silently vanish from the count; and
--   (b) has boundary ambiguity ('ab'+'c' = 'a'+'bc'), which can merge
--       distinct rows into one key.
-- CONCAT_WS with an explicit separator plus COALESCE fixes both.
SELECT
    'Total Records' AS metric,
    COUNT(*) AS count
FROM users
UNION ALL
SELECT
    'Unique Records (by username, email, phone, city)' AS metric,
    COUNT(DISTINCT CONCAT_WS('|',
        COALESCE(username, ''), COALESCE(email, ''),
        COALESCE(phone, ''), COALESCE(city, ''))) AS count
FROM users
UNION ALL
SELECT
    'Duplicate Records' AS metric,
    COUNT(*) - COUNT(DISTINCT CONCAT_WS('|',
        COALESCE(username, ''), COALESCE(email, ''),
        COALESCE(phone, ''), COALESCE(city, ''))) AS count
FROM users
UNION ALL
SELECT
    'Duplicate Username Count' AS metric,
    COUNT(*) - COUNT(DISTINCT username) AS count
FROM users
UNION ALL
SELECT
    'Duplicate Email Count' AS metric,
    COUNT(*) - COUNT(DISTINCT email) AS count
FROM users;
-- 4. A reusable view for duplicate inspection: every row is tagged with
-- its rank inside its duplicate group (duplicate_rank, 1 = keeper) and
-- the group size (duplicate_count).
-- NOTE(review): CREATE VIEW fails if the view already exists; a
-- re-runnable script would use CREATE OR REPLACE VIEW.
CREATE VIEW duplicate_users AS
SELECT
id,
username,
email,
phone,
city,
created_at,
ROW_NUMBER() OVER (
PARTITION BY username, email, phone, city
ORDER BY id
) as duplicate_rank,
COUNT(*) OVER (
PARTITION BY username, email, phone, city
) as duplicate_count
FROM users;
-- List only rows that belong to a duplicate group.
SELECT * FROM duplicate_users WHERE duplicate_count > 1;
防止重复数据
使用唯一约束
-- 1. A table designed to prevent duplicates from the start.
CREATE TABLE users_unique (
id INT AUTO_INCREMENT PRIMARY KEY,
username VARCHAR(50) NOT NULL UNIQUE,
email VARCHAR(100) NOT NULL UNIQUE,
phone VARCHAR(20) UNIQUE,
city VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-- Composite unique keys: the combination must be unique.
-- NOTE(review): with username and email each individually UNIQUE above,
-- these composite keys are logically redundant — confirm they are wanted.
UNIQUE KEY uk_username_email (username, email),
UNIQUE KEY uk_phone_city (phone, city)
);
-- 2. Adding unique constraints to an existing table.
-- These statements FAIL while duplicate values still exist in `users`
-- (e.g. the seed data has duplicate emails/phones) — clean up first.
ALTER TABLE users ADD CONSTRAINT uk_users_email UNIQUE (email);
ALTER TABLE users ADD CONSTRAINT uk_users_phone UNIQUE (phone);
ALTER TABLE users ADD CONSTRAINT uk_users_username_city UNIQUE (username, city);
-- 3. Unique indexes (equivalent enforcement to UNIQUE constraints).
-- NOTE(review): idx_users_email duplicates uk_users_email above — MySQL
-- would maintain two identical indexes on email; one should be dropped.
CREATE UNIQUE INDEX idx_users_email ON users (email);
CREATE UNIQUE INDEX idx_users_username_phone ON users (username, phone);
-- 4. Exercising the constraints.
-- These inserts would fail with a duplicate-key error:
-- INSERT INTO users_unique (username, email, phone, city)
-- VALUES ('alice', 'alice@example.com', '13800138001', 'Beijing');
使用 INSERT IGNORE 和 ON DUPLICATE KEY UPDATE
-- 1. INSERT IGNORE — silently skip rows that violate a unique key.
-- CAUTION: IGNORE downgrades OTHER errors too (e.g. data truncation),
-- so rows can be lost or mangled without any visible failure.
INSERT IGNORE INTO users_unique (username, email, phone, city) VALUES
('alice', 'alice@example.com', '13800138001', 'Beijing'),
('bob', 'bob@example.com', '13800138002', 'Shanghai'),
('alice', 'alice@example.com', '13800138001', 'Beijing'); -- skipped as duplicate
-- 2. ON DUPLICATE KEY UPDATE — turn a conflicting insert into an update.
-- NOTE(review): VALUES() in this clause is deprecated since MySQL
-- 8.0.20 in favour of the row-alias syntax (INSERT ... AS new ... new.city).
INSERT INTO users_unique (username, email, phone, city) VALUES
('alice', 'alice@example.com', '13800138001', 'Beijing')
ON DUPLICATE KEY UPDATE
city = VALUES(city),
created_at = CURRENT_TIMESTAMP;
-- 3. Bulk insert with per-row conflict handling.
INSERT INTO users_unique (username, email, phone, city) VALUES
('charlie', 'charlie@example.com', '13800138003', 'Guangzhou'),
('diana', 'diana@example.com', '13800138004', 'Shenzhen'),
('alice', 'alice@newmail.com', '13800138005', 'Beijing') -- duplicate username
ON DUPLICATE KEY UPDATE
email = VALUES(email),
phone = VALUES(phone),
city = VALUES(city);
-- 4. REPLACE INTO — delete the conflicting row, then insert anew.
-- CAUTION: this is a DELETE + INSERT: the row gets a NEW auto-increment
-- id, DELETE triggers fire, and FK cascades may run.
REPLACE INTO users_unique (username, email, phone, city) VALUES
('alice', 'alice@updated.com', '13800138001', 'Beijing');
-- 5. Conditional insert via NOT EXISTS (no unique key required).
-- NOTE(review): SELECT ... WHERE without FROM may require FROM DUAL on
-- older MySQL versions — confirm the target version. Also note this
-- check-then-insert is racy without a unique constraint.
INSERT INTO users_unique (username, email, phone, city)
SELECT 'new_user', 'new@example.com', '13800138999', 'Chengdu'
WHERE NOT EXISTS (
SELECT 1 FROM users_unique
WHERE username = 'new_user' OR email = 'new@example.com'
);
删除重复数据
使用自连接删除重复
-- 1. Remove exact duplicates, keeping the row with the lowest id.
-- Join each row (dup) to any earlier identical row (keeper); every row
-- that finds such a partner is deleted.
DELETE dup FROM users AS dup
INNER JOIN users AS keeper
    ON dup.id > keeper.id
    AND dup.username = keeper.username
    AND dup.email = keeper.email
    AND dup.phone = keeper.phone
    AND dup.city = keeper.city;

-- 2. Remove rows whose email duplicates an earlier row (keep the oldest).
DELETE dup FROM users AS dup
INNER JOIN users AS keeper
    ON dup.id > keeper.id
    AND dup.email = keeper.email;

-- 3. Remove username duplicates, this time keeping the NEWEST id.
DELETE dup FROM users AS dup
INNER JOIN users AS keeper
    ON dup.id < keeper.id
    AND dup.username = keeper.username;

-- 4. Safe deletion: snapshot the table before destructive work.
CREATE TABLE users_backup AS SELECT * FROM users;

-- Delete (username, email) duplicates, keeping the lowest id.
DELETE dup FROM users AS dup
INNER JOIN users AS keeper
    ON dup.id > keeper.id
    AND dup.username = keeper.username
    AND dup.email = keeper.email;

-- Verify: remaining row count, and whether any (username, email)
-- duplicates survived.
SELECT COUNT(*) AS remaining_records FROM users;
SELECT
    username,
    email,
    COUNT(*) AS count
FROM users
GROUP BY username, email
HAVING COUNT(*) >= 2;
使用窗口函数删除重复
-- 1. Delete duplicates via ROW_NUMBER().
-- Stage the ranking in a temporary table, then delete by id. Staging
-- sidesteps MySQL error 1093 (cannot modify a table that the same
-- statement also selects from).
CREATE TEMPORARY TABLE temp_duplicates AS
SELECT
id,
ROW_NUMBER() OVER (
PARTITION BY username, email, phone, city
ORDER BY id
) as row_num
FROM users;
-- Delete every row ranked after the first in its duplicate group.
DELETE u FROM users u
INNER JOIN temp_duplicates t ON u.id = t.id
WHERE t.row_num > 1;
-- 2. Single-statement variant (MySQL 8.0+).
-- The doubly nested derived table is materialized, which makes selecting
-- from `users` inside a DELETE on `users` legal here. NOT IN is safe in
-- this case because id is the primary key and therefore never NULL.
DELETE FROM users
WHERE id NOT IN (
SELECT id FROM (
SELECT id,
ROW_NUMBER() OVER (
PARTITION BY username, email, phone, city
ORDER BY id
) as row_num
FROM users
) ranked
WHERE row_num = 1
);
-- 3. CTE-based delete (MySQL 8.0+): keep the most recent row per
-- (username, email); note created_at ties would all rank competitively —
-- the ORDER BY id tiebreaker is absent here, so ranking among ties is
-- nondeterministic.
WITH duplicate_cte AS (
SELECT
id,
ROW_NUMBER() OVER (
PARTITION BY username, email
ORDER BY created_at DESC
) as row_num
FROM users
)
DELETE u FROM users u
INNER JOIN duplicate_cte d ON u.id = d.id
WHERE d.row_num > 1;
重建表方式删除重复
-- 1. Build a de-duplicated copy of the table.
-- NOTE: CREATE TABLE ... AS SELECT copies DATA only — indexes,
-- constraints and AUTO_INCREMENT are NOT copied and are re-added below.
CREATE TABLE users_clean AS
SELECT
    MIN(id) AS id,
    username,
    email,
    phone,
    city,
    MIN(created_at) AS created_at
FROM users
GROUP BY username, email, phone, city;
-- 2. Restore the primary key and auto-increment behaviour.
ALTER TABLE users_clean ADD PRIMARY KEY (id);
ALTER TABLE users_clean MODIFY id INT AUTO_INCREMENT;
-- 3. Swap in the clean table.
-- FIX: the original did DROP TABLE users; RENAME ... — that leaves a
-- window with no `users` table (readers error out mid-swap), and the
-- DROP would fail anyway while `orders` holds a FK to users. A single
-- multi-table RENAME is atomic: `users` exists at every instant.
-- NOTE(review): foreign keys referencing `users` follow it through the
-- rename (they will point at users_old); they must be dropped and
-- re-created against the new table before users_old can be dropped.
RENAME TABLE users TO users_old, users_clean TO users;
DROP TABLE users_old;
-- 4. DISTINCT-based rebuild.
-- CAUTION: created_at participates in DISTINCT, so rows identical except
-- for their timestamp are BOTH kept — this dedups less than option 1.
CREATE TABLE users_distinct AS
SELECT DISTINCT username, email, phone, city, created_at
FROM users;
-- Add a fresh auto-increment surrogate key.
ALTER TABLE users_distinct ADD COLUMN id INT AUTO_INCREMENT PRIMARY KEY FIRST;
-- 5. Keep only the most recent row per (username, email).
-- CAUTION: if several rows share the same max created_at, all of them
-- survive the join — add a tiebreaker if a single row is required.
CREATE TABLE users_latest AS
SELECT u1.*
FROM users u1
INNER JOIN (
    SELECT
        username,
        email,
        MAX(created_at) AS latest_created
    FROM users
    GROUP BY username, email
) u2 ON u1.username = u2.username
    AND u1.email = u2.email
    AND u1.created_at = u2.latest_created;
高级重复处理技术
模糊匹配去重
-- 1. Similarity-based de-duplication.
-- User-defined Levenshtein edit distance. Classic iterative dynamic
-- programming: cv1 holds the previous row of the DP matrix and cv0 the
-- current row, with each cell packed into ONE byte of a VARBINARY via
-- UNHEX(HEX(...)).
-- NOTE(review): one byte per cell means any cell value above 255 (HEX
-- yields >2 hex digits) corrupts the row, and VARBINARY(256) caps the
-- usable length of s2 — fine for short usernames, not general-purpose.
-- NOTE(review): if s1 is empty the outer loop never runs and c is never
-- assigned, so the function returns NULL instead of CHAR_LENGTH(s2) —
-- confirm callers never pass empty strings.
DELIMITER //
CREATE FUNCTION LEVENSHTEIN(s1 VARCHAR(255), s2 VARCHAR(255))
RETURNS INT
READS SQL DATA
DETERMINISTIC
BEGIN
DECLARE s1_len, s2_len, i, j, c, c_temp, cost INT;
DECLARE s1_char CHAR(1);
DECLARE s2_char CHAR(1);
DECLARE cv0, cv1 VARBINARY(256);
SET s1_len = CHAR_LENGTH(s1);
SET s2_len = CHAR_LENGTH(s2);
SET cv1 = 0x00;
SET j = 1;
SET i = 1;
-- Initialise row 0 of the DP matrix: cell j holds distance j.
WHILE j <= s2_len DO
SET cv1 = CONCAT(cv1, UNHEX(HEX(j)));
SET j = j + 1;
END WHILE;
-- One pass per character of s1, computing the next DP row.
WHILE i <= s1_len DO
SET s1_char = SUBSTRING(s1, i, 1);
SET c = i;
SET cv0 = UNHEX(HEX(i));
SET j = 1;
WHILE j <= s2_len DO
SET s2_char = SUBSTRING(s2, j, 1);
SET c = c + 1;
-- Substitution costs 1 unless the characters match.
IF s1_char = s2_char THEN
SET cost = 0;
ELSE
SET cost = 1;
END IF;
-- Diagonal (substitute/match) candidate from the previous row.
SET c_temp = CONV(HEX(SUBSTRING(cv1, j, 1)), 16, 10) + cost;
IF c > c_temp THEN
SET c = c_temp;
END IF;
-- Left (insertion) candidate from the current row.
SET c_temp = CONV(HEX(SUBSTRING(cv0, j, 1)), 16, 10) + 1;
IF c > c_temp THEN
SET c = c_temp;
END IF;
SET cv0 = CONCAT(cv0, UNHEX(HEX(c)));
SET j = j + 1;
END WHILE;
-- Current row becomes the previous row for the next iteration.
SET cv1 = cv0;
SET i = i + 1;
END WHILE;
-- c holds the bottom-right DP cell: the edit distance.
RETURN c;
END //
DELIMITER ;
-- 2. Usernames within edit distance 2 of each other, exact matches
-- excluded (those are handled by the equality-based queries earlier).
SELECT
    a.id,
    a.username,
    b.id AS similar_id,
    b.username AS similar_username,
    LEVENSHTEIN(a.username, b.username) AS distance
FROM users AS a
INNER JOIN users AS b
    ON a.id < b.id
WHERE LEVENSHTEIN(a.username, b.username) <= 2
  AND a.username <> b.username;

-- 3. Phonetically similar usernames via SOUNDEX.
SELECT
    a.id,
    a.username,
    b.id AS similar_id,
    b.username AS similar_username,
    SOUNDEX(a.username) AS soundex1,
    SOUNDEX(b.username) AS soundex2
FROM users AS a
INNER JOIN users AS b
    ON a.id < b.id
WHERE SOUNDEX(a.username) = SOUNDEX(b.username)
  AND a.username <> b.username;
批量去重存储过程
-- Batch de-duplication procedure: backs up the target table, then
-- deletes duplicates according to keep_rule.
--   table_name        target table (must have an integer `id` PK)
--   duplicate_columns for 'first'/'last': a t1/t2 join predicate, e.g.
--                     't1.username = t2.username'; for 'min_id'/'max_id':
--                     a GROUP BY column list, e.g. 'username, email'.
--   keep_rule         which row of each duplicate group survives;
--                     NULL is treated as 'first'.
-- SECURITY: identifiers and predicates are interpolated into dynamic
-- SQL — only ever pass trusted, whitelisted values, never user input.
DELIMITER //
CREATE PROCEDURE RemoveDuplicates(
    IN table_name VARCHAR(64),
    IN duplicate_columns TEXT,
    -- FIX: MySQL procedure parameters cannot carry DEFAULT clauses (the
    -- original `DEFAULT 'first'` is a syntax error); the default is
    -- emulated below by mapping NULL to 'first'.
    IN keep_rule ENUM('first', 'last', 'min_id', 'max_id')
)
BEGIN
    DECLARE backup_table VARCHAR(64);
    DECLARE deleted_count INT;
    IF keep_rule IS NULL THEN
        SET keep_rule = 'first';
    END IF;
    -- Timestamped backup before any destructive work.
    SET backup_table = CONCAT(table_name, '_backup_', DATE_FORMAT(NOW(), '%Y%m%d_%H%i%s'));
    SET @sql = CONCAT('CREATE TABLE ', backup_table, ' AS SELECT * FROM ', table_name);
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;
    -- Build the delete statement for the requested keep rule.
    CASE keep_rule
        WHEN 'first' THEN
            SET @sql = CONCAT(
                'DELETE t1 FROM ', table_name, ' t1 ',
                'INNER JOIN ', table_name, ' t2 ',
                'WHERE t1.id > t2.id AND ',
                duplicate_columns
            );
        WHEN 'last' THEN
            SET @sql = CONCAT(
                'DELETE t1 FROM ', table_name, ' t1 ',
                'INNER JOIN ', table_name, ' t2 ',
                'WHERE t1.id < t2.id AND ',
                duplicate_columns
            );
        WHEN 'min_id' THEN
            SET @sql = CONCAT(
                'DELETE FROM ', table_name, ' WHERE id NOT IN (',
                'SELECT min_id FROM (',
                'SELECT MIN(id) as min_id FROM ', table_name, ' GROUP BY ', duplicate_columns,
                ') as temp)'
            );
        WHEN 'max_id' THEN
            SET @sql = CONCAT(
                'DELETE FROM ', table_name, ' WHERE id NOT IN (',
                'SELECT max_id FROM (',
                'SELECT MAX(id) as max_id FROM ', table_name, ' GROUP BY ', duplicate_columns,
                ') as temp)'
            );
    END CASE;
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    -- FIX: capture ROW_COUNT() immediately after the EXECUTE; reading it
    -- after DEALLOCATE/other statements risks reporting the wrong count.
    SET deleted_count = ROW_COUNT();
    DEALLOCATE PREPARE stmt;
    SELECT
        CONCAT('Backup created: ', backup_table) AS message,
        deleted_count AS deleted_rows;
END //
DELIMITER ;
-- Example (trusted identifiers only):
-- CALL RemoveDuplicates('users', 't1.username = t2.username AND t1.email = t2.email', 'first');
实时重复检测触发器
-- 创建重复检测日志表
CREATE TABLE duplicate_log (
id INT AUTO_INCREMENT PRIMARY KEY,
table_name VARCHAR(64),
duplicate_type VARCHAR(50),
original_id INT,
duplicate_data JSON,
detected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Trigger that detects (and works around) duplicate inserts.
-- NOTE(review): this adds a full SELECT against `users` to EVERY insert,
-- and the check-then-act is not race-free without a unique index —
-- treat it as a detection/logging aid, not an enforcement mechanism.
DELIMITER //
CREATE TRIGGER check_duplicate_insert
BEFORE INSERT ON users
FOR EACH ROW
BEGIN
DECLARE duplicate_count INT DEFAULT 0;
DECLARE duplicate_id INT DEFAULT 0;
-- Does a row with the same (username, email) already exist?
SELECT COUNT(*), MIN(id) INTO duplicate_count, duplicate_id
FROM users
WHERE username = NEW.username AND email = NEW.email;
-- If so, record the attempt in the audit table.
IF duplicate_count > 0 THEN
INSERT INTO duplicate_log (table_name, duplicate_type, original_id, duplicate_data)
VALUES (
'users',
'username_email_duplicate',
duplicate_id,
JSON_OBJECT(
'username', NEW.username,
'email', NEW.email,
'phone', NEW.phone,
'city', NEW.city
)
);
-- Option A (disabled): reject the insert outright.
-- SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'Duplicate username and email combination';
-- Option B (active): mutate the incoming row so it no longer collides.
-- CAUTION: this silently rewrites user data (username gets a timestamp
-- suffix) — make sure callers expect that.
SET NEW.username = CONCAT(NEW.username, '_', UNIX_TIMESTAMP());
END IF;
END //
DELIMITER ;
-- Companion trigger for updates: logs and REJECTS any update that would
-- make (username, email) collide with a different existing row.
DELIMITER //
CREATE TRIGGER check_duplicate_update
BEFORE UPDATE ON users
FOR EACH ROW
BEGIN
DECLARE duplicate_count INT DEFAULT 0;
-- Would the updated values collide with another row (id excluded)?
SELECT COUNT(*) INTO duplicate_count
FROM users
WHERE username = NEW.username
AND email = NEW.email
AND id != NEW.id;
IF duplicate_count > 0 THEN
-- Record old and new values for the audit trail.
INSERT INTO duplicate_log (table_name, duplicate_type, original_id, duplicate_data)
VALUES (
'users',
'update_would_create_duplicate',
NEW.id,
JSON_OBJECT(
'old_username', OLD.username,
'new_username', NEW.username,
'old_email', OLD.email,
'new_email', NEW.email
)
);
-- Abort the update with a custom error.
SIGNAL SQLSTATE '45000' SET MESSAGE_TEXT = 'Update would create duplicate record';
END IF;
END //
DELIMITER ;
-- Summarise the duplicate-detection audit log: how often each duplicate
-- type fired per table, and when it last happened.
SELECT
    table_name,
    duplicate_type,
    COUNT(*) AS occurrence_count,
    MAX(detected_at) AS last_detected
FROM duplicate_log
GROUP BY table_name, duplicate_type
ORDER BY MAX(detected_at) DESC;
处理策略对比
方法 | 适用场景 | 优点 | 缺点 | 性能 |
---|---|---|---|---|
唯一约束 | 预防重复数据 | 完全防止重复,性能好 | 需要提前设计,难以修改 | 高 |
INSERT IGNORE | 批量导入时忽略重复 | 简单易用,不会报错 | 静默忽略,可能丢失数据 | 高 |
ON DUPLICATE KEY UPDATE | 更新重复数据 | 灵活处理重复,可自定义逻辑 | 语法复杂,需要唯一键 | 中等 |
自连接删除 | 清理现有重复数据 | 直接有效,逻辑清晰 | 大表性能差,需要小心操作 | 低 |
窗口函数 | 复杂重复检测和删除 | 功能强大,支持复杂逻辑 | 需要 MySQL 8.0+ | 中等 |
重建表 | 大量重复数据清理 | 彻底清理,性能好 | 需要停机时间,风险高 | 高 |
最佳实践和注意事项
🛡️ 预防为主
设计阶段就考虑唯一性约束,从源头防止重复数据
💾 备份先行
删除重复数据前务必创建备份,确保数据安全
🔍 分步验证
先查询确认重复数据,再执行删除操作
⚡ 性能考虑
大表操作时考虑分批处理,避免长时间锁表
📊 监控日志
建立重复数据监控机制,及时发现问题
🔄 定期清理
建立定期的数据清理流程,保持数据质量
📋 操作检查清单
- 操作前:备份数据、确认重复规则、测试查询语句
- 操作中:监控执行进度、检查锁等待、观察系统负载
- 操作后:验证数据完整性、检查约束、更新统计信息
- 长期:建立监控机制、定期数据质量检查、优化预防措施
⚠️ 常见陷阱
- 外键约束:删除重复数据时注意外键关系,可能需要级联处理
- 事务处理:大批量删除时合理使用事务,避免长时间锁定
- 索引影响:删除操作可能影响索引性能,考虑重建索引
- 应用影响:确保应用程序能正确处理重复数据的各种情况
- NULL 值处理:NULL 与任何值的比较结果都是 UNKNOWN(即使是 NULL = NULL 也不为真),判断空值必须使用 IS NULL / IS NOT NULL,按可空列去重时要特别留意
总结
处理重复数据是数据库维护的重要环节,需要根据具体场景选择合适的策略。预防重复数据的产生比事后清理更加重要和高效。
🎯 关键要点
- 设计阶段:合理设置唯一约束和索引
- 开发阶段:使用适当的插入语句处理重复
- 维护阶段:定期检测和清理重复数据
- 监控阶段:建立重复数据监控和报警机制
掌握这些重复数据处理技术,可以有效提高数据质量,确保数据库的一致性和可靠性。