重复数据概述
重复数据是数据库中常见的问题,可能由于数据导入错误、应用程序逻辑缺陷、并发插入等原因产生。有效处理重复数据对于维护数据质量和系统性能至关重要。
🔍 识别重复
使用 GROUP BY 和聚合函数找出重复记录
🚫 防止重复
通过约束和索引预防重复数据产生
🗑️ 删除重复
使用各种技术删除已存在的重复记录
🔄 处理重复
在插入时智能处理重复数据
重复数据的常见原因:
- 数据导入时缺乏去重机制
- 应用程序的重复提交
- 并发操作导致的竞态条件
- 数据同步过程中的错误
- 缺乏适当的唯一性约束
- ETL 过程中的数据重复
识别重复数据
准备示例数据
-- Create the sample table used throughout this tutorial.
CREATE TABLE customers (
id INT AUTO_INCREMENT PRIMARY KEY,
first_name VARCHAR(50),
last_name VARCHAR(50),
email VARCHAR(100),
phone VARCHAR(20),
city VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Seed it with deliberately duplicated rows (see trailing comments).
INSERT INTO customers (first_name, last_name, email, phone, city) VALUES
('John', 'Doe', 'john.doe@email.com', '123-456-7890', 'New York'),
('Jane', 'Smith', 'jane.smith@email.com', '234-567-8901', 'Los Angeles'),
('John', 'Doe', 'john.doe@email.com', '123-456-7890', 'New York'), -- exact duplicate
('Jane', 'Smith', 'jane.smith@gmail.com', '234-567-8901', 'Los Angeles'), -- different email
('Bob', 'Johnson', 'bob.johnson@email.com', '345-678-9012', 'Chicago'),
('John', 'Doe', 'john.d@email.com', '123-456-7890', 'New York'), -- different email
('Alice', 'Brown', 'alice.brown@email.com', '456-789-0123', 'Houston'),
('Bob', 'Johnson', 'bob.johnson@email.com', '345-678-9012', 'Chicago'), -- exact duplicate
('Charlie', 'Wilson', 'charlie.wilson@email.com', '567-890-1234', 'Phoenix'),
('Alice', 'Brown', 'alice.b@email.com', '456-789-0123', 'Houston'); -- different email
-- Inspect all rows.
SELECT * FROM customers ORDER BY first_name, last_name;
基本重复检测
-- Find fully duplicated rows (every column identical except the id).
SELECT
first_name,
last_name,
email,
phone,
city,
COUNT(*) as duplicate_count
FROM customers
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC;
-- Find duplicates by name only, listing the distinct emails/phones involved.
SELECT
first_name,
last_name,
COUNT(*) as name_duplicates,
GROUP_CONCAT(DISTINCT email ORDER BY email) as emails,
GROUP_CONCAT(DISTINCT phone ORDER BY phone) as phones
FROM customers
GROUP BY first_name, last_name
HAVING COUNT(*) > 1
ORDER BY name_duplicates DESC;
-- Find duplicates by email address.
SELECT
email,
COUNT(*) as email_duplicates,
GROUP_CONCAT(CONCAT(first_name, ' ', last_name) ORDER BY id) as names
FROM customers
GROUP BY email
HAVING COUNT(*) > 1;
-- Find duplicates by phone number.
SELECT
phone,
COUNT(*) as phone_duplicates,
GROUP_CONCAT(CONCAT(first_name, ' ', last_name) ORDER BY id) as names
FROM customers
GROUP BY phone
HAVING COUNT(*) > 1;
高级重复检测
-- Tag every row with its position inside its duplicate group (row_num = 1
-- marks the earliest row; total_duplicates > 1 marks a duplicated group).
SELECT
id,
first_name,
last_name,
email,
phone,
ROW_NUMBER() OVER (
PARTITION BY first_name, last_name, email, phone, city
ORDER BY id
) as row_num,
COUNT(*) OVER (
PARTITION BY first_name, last_name, email, phone, city
) as total_duplicates
FROM customers
ORDER BY first_name, last_name, id;
-- List every duplicate row EXCEPT the earliest one in each group.
WITH duplicate_analysis AS (
SELECT
id,
first_name,
last_name,
email,
phone,
city,
ROW_NUMBER() OVER (
PARTITION BY first_name, last_name, email, phone, city
ORDER BY id
) as row_num
FROM customers
)
SELECT
id,
first_name,
last_name,
email,
'DUPLICATE' as status
FROM duplicate_analysis
WHERE row_num > 1
ORDER BY first_name, last_name;
-- Fuzzy matching: rows that are similar but not fully identical.
-- The c1.id < c2.id join emits each pair exactly once.
SELECT
c1.id as id1,
c2.id as id2,
c1.first_name,
c1.last_name,
c1.email as email1,
c2.email as email2,
c1.phone,
'SIMILAR_RECORDS' as match_type
FROM customers c1
JOIN customers c2 ON c1.id < c2.id
WHERE c1.first_name = c2.first_name
AND c1.last_name = c2.last_name
AND c1.phone = c2.phone
AND c1.email != c2.email -- same name and phone, different email
ORDER BY c1.first_name, c1.last_name;
重复数据统计报告
-- Generate a duplicate-data summary report.
-- Each UNION ALL branch contributes one metric row; the first branch's
-- column aliases name the output columns.
SELECT
'Total Records' as metric,
COUNT(*) as count
FROM customers
UNION ALL
SELECT
-- Fixed label: the DISTINCT key below includes `city`, which the original
-- label ("by name+email+phone") omitted.
'Unique Records (by name+email+phone+city)',
COUNT(*)
FROM (
SELECT DISTINCT first_name, last_name, email, phone, city
FROM customers
) as unique_records
UNION ALL
SELECT
-- Surplus rows = total rows minus distinct rows.
'Duplicate Records',
COUNT(*) - (
SELECT COUNT(*) FROM (
SELECT DISTINCT first_name, last_name, email, phone, city
FROM customers
) as unique_records
)
FROM customers
UNION ALL
SELECT
-- Number of distinct groups that contain more than one row.
'Duplicate Groups',
COUNT(*)
FROM (
SELECT first_name, last_name, email, phone, city
FROM customers
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1
) as duplicate_groups;
-- Break duplicate statistics down by matching key (name / email / phone).
-- `groups` counts duplicate groups; `extra_records` counts removable rows
-- (group size minus the one row we would keep).
SELECT
'Name Duplicates' as duplicate_type,
COUNT(*) as groups,
SUM(duplicate_count - 1) as extra_records
FROM (
SELECT COUNT(*) as duplicate_count
FROM customers
GROUP BY first_name, last_name
HAVING COUNT(*) > 1
) as name_dups
UNION ALL
SELECT
'Email Duplicates',
COUNT(*),
SUM(duplicate_count - 1)
FROM (
SELECT COUNT(*) as duplicate_count
FROM customers
GROUP BY email
HAVING COUNT(*) > 1
) as email_dups
UNION ALL
SELECT
'Phone Duplicates',
COUNT(*),
SUM(duplicate_count - 1)
FROM (
SELECT COUNT(*) as duplicate_count
FROM customers
GROUP BY phone
HAVING COUNT(*) > 1
) as phone_dups;
防止重复数据
使用唯一约束
-- Create a table with uniqueness enforced up front (prevention beats cleanup).
CREATE TABLE users_unique (
id INT AUTO_INCREMENT PRIMARY KEY,
username VARCHAR(50) NOT NULL UNIQUE,
email VARCHAR(100) NOT NULL UNIQUE,
phone VARCHAR(20),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-- Composite unique constraint over the (phone, email) pair.
UNIQUE KEY uk_phone_email (phone, email)
);
-- Build a de-duplicated copy of customers that can actually accept the
-- UNIQUE keys added below. SELECT DISTINCT alone is NOT sufficient here:
-- two distinct rows can still share a phone number (the sample data has
-- 'John Doe' twice with the same phone but different emails), so
-- ADD UNIQUE KEY uk_phone would fail with a "Duplicate entry" error.
-- Instead keep exactly one row — the one with the smallest id — per
-- (first_name, last_name, phone) identity.
CREATE TABLE customers_clean AS
SELECT c.first_name, c.last_name, c.email, c.phone, c.city
FROM customers AS c
INNER JOIN (
SELECT MIN(id) AS keep_id
FROM customers
GROUP BY first_name, last_name, phone
) AS keep ON keep.keep_id = c.id;
-- Add a surrogate primary key, then enforce uniqueness going forward.
ALTER TABLE customers_clean ADD COLUMN id INT AUTO_INCREMENT PRIMARY KEY FIRST;
ALTER TABLE customers_clean ADD UNIQUE KEY uk_email (email);
ALTER TABLE customers_clean ADD UNIQUE KEY uk_phone (phone);
ALTER TABLE customers_clean ADD UNIQUE KEY uk_name_phone (first_name, last_name, phone);
-- Exercise the unique constraints.
INSERT INTO users_unique (username, email, phone) VALUES
('john_doe', 'john@example.com', '123-456-7890');
-- This would fail with a duplicate-key error (email already exists):
-- INSERT INTO users_unique (username, email, phone) VALUES
-- ('jane_doe', 'john@example.com', '234-567-8901');
-- List the table's non-primary constraints.
-- Fixed: CONSTRAINT_TYPE is a column of TABLE_CONSTRAINTS, not of
-- KEY_COLUMN_USAGE — the original query failed with "Unknown column".
-- Join the two INFORMATION_SCHEMA views to get column names AND types.
SELECT
kcu.CONSTRAINT_NAME,
kcu.COLUMN_NAME,
tc.CONSTRAINT_TYPE
FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE kcu
INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
ON tc.CONSTRAINT_SCHEMA = kcu.CONSTRAINT_SCHEMA
AND tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
AND tc.TABLE_NAME = kcu.TABLE_NAME
WHERE kcu.TABLE_SCHEMA = DATABASE()
AND kcu.TABLE_NAME = 'users_unique'
AND kcu.CONSTRAINT_NAME != 'PRIMARY'
ORDER BY kcu.CONSTRAINT_NAME;
使用 INSERT IGNORE
-- INSERT IGNORE downgrades duplicate-key errors to warnings and silently
-- skips the offending rows instead of aborting the whole statement.
INSERT IGNORE INTO users_unique (username, email, phone) VALUES
('john_doe2', 'john@example.com', '123-456-7890'), -- duplicate email: skipped
('jane_smith', 'jane@example.com', '234-567-8901'), -- new row: inserted
('bob_wilson', 'john@example.com', '345-678-9012'); -- duplicate email: skipped
-- ROW_COUNT() reflects only the immediately preceding statement, so it
-- must be read BEFORE anything else runs. The original ran
-- `SELECT * FROM users_unique` first, which resets ROW_COUNT() to -1
-- and made the reported value meaningless.
SELECT ROW_COUNT() as affected_rows;
-- Inspect the table contents.
SELECT * FROM users_unique;
-- Bulk load with INSERT IGNORE: rows violating customers_clean's unique
-- keys are silently skipped.
INSERT IGNORE INTO customers_clean (first_name, last_name, email, phone, city)
SELECT first_name, last_name, email, phone, city
FROM customers;
-- Check how many rows the table holds now.
SELECT COUNT(*) as total_inserted FROM customers_clean;
使用 ON DUPLICATE KEY UPDATE
-- Create a products table that tracks both creation and update times.
CREATE TABLE products (
id INT AUTO_INCREMENT PRIMARY KEY,
product_code VARCHAR(50) UNIQUE,
product_name VARCHAR(100),
price DECIMAL(10,2),
stock_quantity INT DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);
-- Upsert: insert new rows, or update the existing row when product_code
-- collides. NOTE(review): VALUES() in this clause is deprecated since
-- MySQL 8.0.20 in favor of a row alias (INSERT ... AS new ... new.col);
-- kept here for broad compatibility — confirm against the target version.
INSERT INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Product A', 99.99, 100),
('PROD002', 'Product B', 149.99, 50),
('PROD003', 'Product C', 199.99, 25)
ON DUPLICATE KEY UPDATE
product_name = VALUES(product_name),
price = VALUES(price),
stock_quantity = stock_quantity + VALUES(stock_quantity),
updated_at = CURRENT_TIMESTAMP;
-- Re-insert an existing product code: PROD001 is updated, PROD004 inserted.
INSERT INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Updated Product A', 109.99, 50),
('PROD004', 'Product D', 79.99, 75)
ON DUPLICATE KEY UPDATE
product_name = VALUES(product_name),
price = VALUES(price),
stock_quantity = stock_quantity + VALUES(stock_quantity),
updated_at = CURRENT_TIMESTAMP;
-- Inspect the result.
SELECT * FROM products ORDER BY product_code;
-- Conditional upsert: only take the new name when the new price is higher,
-- and never let the stored price decrease.
INSERT INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Product A v2', 119.99, 25)
ON DUPLICATE KEY UPDATE
product_name = CASE
WHEN VALUES(price) > price THEN VALUES(product_name)
ELSE product_name
END,
price = GREATEST(price, VALUES(price)),
stock_quantity = stock_quantity + VALUES(stock_quantity),
updated_at = CURRENT_TIMESTAMP;
使用 REPLACE INTO
-- REPLACE INTO deletes any row with a colliding key, then inserts the new
-- row. Beware: the delete+insert means AUTO_INCREMENT ids change and any
-- ON DELETE foreign-key actions / DELETE triggers fire.
REPLACE INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Replaced Product A', 129.99, 200),
('PROD005', 'Product E', 59.99, 150);
-- Inspect the result (note: PROD001's id may have changed).
SELECT * FROM products ORDER BY product_code;
-- Bulk REPLACE via a staging table.
CREATE TEMPORARY TABLE temp_products (
product_code VARCHAR(50),
product_name VARCHAR(100),
price DECIMAL(10,2),
stock_quantity INT
);
INSERT INTO temp_products VALUES
('PROD001', 'Final Product A', 139.99, 300),
('PROD002', 'Final Product B', 159.99, 100),
('PROD006', 'Product F', 89.99, 80);
REPLACE INTO products (product_code, product_name, price, stock_quantity)
SELECT product_code, product_name, price, stock_quantity
FROM temp_products;
-- Inspect the final state.
SELECT * FROM products ORDER BY product_code;
删除重复数据
使用自连接删除重复
-- Create a scratch copy of customers to experiment on.
CREATE TABLE test_duplicates AS
SELECT * FROM customers;
-- Inspect the duplicate groups before deleting.
SELECT
first_name, last_name, email, phone, city,
COUNT(*) as count
FROM test_duplicates
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1;
-- Method 1: self-join delete, keeping the smallest id in each fully
-- identical group. Fixed idiom: the join conditions belong in ON — the
-- original "INNER JOIN ... WHERE" form was a disguised cross join with a
-- filter. Same rows deleted, but the intent (and the optimizer's job) is
-- now explicit. The id inequality stays in WHERE as the row filter.
DELETE t1 FROM test_duplicates t1
INNER JOIN test_duplicates t2
ON t1.first_name = t2.first_name
AND t1.last_name = t2.last_name
AND t1.email = t2.email
AND t1.phone = t2.phone
AND t1.city = t2.city
WHERE t1.id > t2.id;
-- Verify the result.
SELECT COUNT(*) as remaining_records FROM test_duplicates;
SELECT * FROM test_duplicates ORDER BY first_name, last_name;
使用窗口函数删除重复
-- Rebuild the scratch table for the next method.
DROP TABLE test_duplicates;
CREATE TABLE test_duplicates AS
SELECT * FROM customers;
-- Method 2: rank rows per duplicate group with a window function and keep
-- only row_num = 1 (the smallest id) in a temporary "keep" list.
CREATE TEMPORARY TABLE temp_keep_ids AS
SELECT id
FROM (
SELECT
id,
ROW_NUMBER() OVER (
PARTITION BY first_name, last_name, email, phone, city
ORDER BY id
) as row_num
FROM test_duplicates
) ranked
WHERE row_num = 1;
-- Delete everything not on the keep list. NOT IN is safe here because id
-- is a primary key and therefore never NULL.
DELETE FROM test_duplicates
WHERE id NOT IN (SELECT id FROM temp_keep_ids);
-- Inspect the result.
SELECT COUNT(*) as remaining_records FROM test_duplicates;
-- Verify no duplicate groups remain (this query should return zero rows).
SELECT
first_name, last_name, email, phone, city,
COUNT(*) as count
FROM test_duplicates
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1;
使用 GROUP BY 重建表
-- Method 3: rebuild the table (the safest approach — the original data is
-- never modified in place).
-- Back up the original table first.
CREATE TABLE customers_backup AS
SELECT * FROM customers;
-- Build the de-duplicated table via GROUP BY.
-- NOTE(review): MIN(id) and MIN(created_at) are computed independently and
-- may come from DIFFERENT rows of the same group — acceptable here, but
-- confirm that mixing column values across group members is intended.
CREATE TABLE customers_dedup AS
SELECT
MIN(id) as id, -- keep the smallest id
first_name,
last_name,
email,
phone,
city,
MIN(created_at) as created_at -- keep the earliest creation time
FROM customers
GROUP BY first_name, last_name, email, phone, city;
-- Compare row counts before/after.
SELECT COUNT(*) as original_count FROM customers;
SELECT COUNT(*) as dedup_count FROM customers_dedup;
SELECT * FROM customers_dedup ORDER BY first_name, last_name;
-- Once satisfied, atomically swap the tables:
-- RENAME TABLE customers TO customers_old, customers_dedup TO customers;
条件性删除重复
-- Build a more complex duplicate scenario: same order data, varying status.
CREATE TABLE orders_with_duplicates (
id INT AUTO_INCREMENT PRIMARY KEY,
order_number VARCHAR(50),
customer_id INT,
order_date DATE,
amount DECIMAL(10,2),
status VARCHAR(20),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
INSERT INTO orders_with_duplicates
(order_number, customer_id, order_date, amount, status) VALUES
('ORD001', 1, '2024-01-15', 299.99, 'completed'),
('ORD001', 1, '2024-01-15', 299.99, 'pending'), -- duplicate, different status
('ORD002', 2, '2024-01-16', 149.50, 'completed'),
('ORD003', 3, '2024-01-17', 89.99, 'completed'),
('ORD003', 3, '2024-01-17', 89.99, 'completed'), -- exact duplicate
('ORD004', 1, '2024-01-18', 199.99, 'cancelled'),
('ORD004', 1, '2024-01-18', 199.99, 'completed'); -- duplicate, different status
-- Delete duplicate orders, preferring to KEEP the 'completed' row.
-- Bug fixed: the original combined "o1.id > o2.id" with the status test,
-- so a 'completed' row with the LARGER id could never cause its earlier
-- non-completed twin to be deleted (ORD004: cancelled id < completed id
-- kept BOTH rows). The predicate is now split:
--   * exact duplicates (same status): keep the smallest id;
--   * mixed status: delete any non-completed row that has a completed twin,
--     regardless of id order.
DELETE o1 FROM orders_with_duplicates o1
INNER JOIN orders_with_duplicates o2
ON o1.order_number = o2.order_number
AND o1.customer_id = o2.customer_id
AND o1.order_date = o2.order_date
AND o1.amount = o2.amount
AND o1.id <> o2.id
WHERE (o1.status = o2.status AND o1.id > o2.id)
OR (o1.status <> 'completed' AND o2.status = 'completed');
-- Inspect the result.
SELECT * FROM orders_with_duplicates ORDER BY order_number;
-- More complex conditional delete: keep only the most recent row per group.
-- NOTE(review): all sample rows were inserted in one statement, so their
-- created_at values are likely identical (second resolution) and this demo
-- may delete nothing — confirm with real, time-separated data.
DELETE o1 FROM orders_with_duplicates o1
INNER JOIN (
SELECT
order_number,
customer_id,
order_date,
amount,
MAX(created_at) as latest_created
FROM orders_with_duplicates
GROUP BY order_number, customer_id, order_date, amount
HAVING COUNT(*) > 1
) o2 ON o1.order_number = o2.order_number
AND o1.customer_id = o2.customer_id
AND o1.order_date = o2.order_date
AND o1.amount = o2.amount
AND o1.created_at < o2.latest_created;
-- Final state.
SELECT * FROM orders_with_duplicates ORDER BY order_number;
高级重复处理技术
模糊匹配去重
-- Build a table of near-duplicate contacts (name/phone format variations).
CREATE TABLE contacts (
id INT AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(100),
email VARCHAR(100),
phone VARCHAR(20),
company VARCHAR(100)
);
INSERT INTO contacts (name, email, phone, company) VALUES
('John Smith', 'john.smith@company.com', '123-456-7890', 'ABC Corp'),
('J. Smith', 'j.smith@company.com', '123-456-7890', 'ABC Corporation'),
('John A. Smith', 'john.smith@company.com', '(123) 456-7890', 'ABC Corp'),
('Jane Doe', 'jane.doe@email.com', '234-567-8901', 'XYZ Inc'),
('Jane M. Doe', 'jane.doe@email.com', '234.567.8901', 'XYZ Inc.');
-- Phonetic matching with SOUNDEX: pairs whose names sound alike, or that
-- share a phone number. The c1.id < c2.id join emits each pair once.
SELECT
c1.id as id1,
c1.name as name1,
c2.id as id2,
c2.name as name2,
c1.phone,
SOUNDEX(c1.name) as soundex1,
SOUNDEX(c2.name) as soundex2
FROM contacts c1
JOIN contacts c2 ON c1.id < c2.id
WHERE SOUNDEX(c1.name) = SOUNDEX(c2.name)
OR (c1.phone = c2.phone AND c1.phone IS NOT NULL);
-- Normalize phone numbers (strip everything but digits) for comparison.
-- NOTE(review): REGEXP_REPLACE requires MySQL 8.0+ — confirm target version.
SELECT
id,
name,
phone,
REGEXP_REPLACE(phone, '[^0-9]', '') as normalized_phone
FROM contacts;
-- Find duplicates on the normalized phone number.
WITH normalized_contacts AS (
SELECT
id,
name,
email,
phone,
REGEXP_REPLACE(phone, '[^0-9]', '') as clean_phone,
company
FROM contacts
)
SELECT
c1.id,
c1.name,
c1.email,
c1.phone,
c1.company,
'DUPLICATE_PHONE' as match_reason
FROM normalized_contacts c1
JOIN normalized_contacts c2 ON c1.id > c2.id
WHERE c1.clean_phone = c2.clean_phone
AND c1.clean_phone != '';
批量去重处理
-- Generic table de-duplication helper.
--   table_name:    table to clean (must have an integer `id` primary key).
--   key_columns:   boolean SQL over aliases t1/t2 defining "duplicate",
--                  e.g. 't1.email = t2.email AND t1.phone = t2.phone'.
--   keep_criteria: optional extra predicate, or NULL for none.
-- WARNING: the arguments are spliced into dynamic SQL unescaped — call it
-- only with trusted, hard-coded values, never user-supplied input.
DELIMITER //
CREATE PROCEDURE DeduplicateTable(
IN table_name VARCHAR(64),
IN key_columns TEXT,
IN keep_criteria VARCHAR(255)
)
BEGIN
DECLARE backup_table VARCHAR(64);
DECLARE removed_count INT DEFAULT 0;
-- Timestamped backup name, e.g. customers_backup_20240101_120000.
SET backup_table = CONCAT(table_name, '_backup_', DATE_FORMAT(NOW(), '%Y%m%d_%H%i%s'));
-- Back up the original table before deleting anything.
SET @sql = CONCAT('CREATE TABLE ', backup_table, ' AS SELECT * FROM ', table_name);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- Self-join delete: keep the smallest id in each duplicate group.
-- IFNULL handles a NULL keep_criteria (CONCAT with NULL yields NULL).
SET @sql = CONCAT(
'DELETE t1 FROM ', table_name, ' t1 ',
'INNER JOIN ', table_name, ' t2 ',
'WHERE t1.id > t2.id ',
'AND (', key_columns, ') ',
IFNULL(CONCAT('AND ', keep_criteria), '')
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
-- ROW_COUNT() reflects only the immediately preceding statement, so it
-- must be captured BEFORE DEALLOCATE/SELECT reset it. The original read
-- it inside the final SELECT and therefore always reported a stale value.
SET removed_count = ROW_COUNT();
DEALLOCATE PREPARE stmt;
-- Report what was done.
SELECT
table_name as processed_table,
backup_table as backup_created,
removed_count as records_removed;
END //
DELIMITER ;
-- Example invocation (commented out):
-- CALL DeduplicateTable(
--     'customers',
--     't1.first_name = t2.first_name AND t1.last_name = t2.last_name AND t1.email = t2.email',
--     NULL
-- );
-- Build a JSON de-duplication summary.
-- NOTE: despite the table_name parameter, the metrics are hard-coded to
-- the `customers` table — stored functions cannot PREPARE dynamic SQL.
-- The parameter is only echoed into the report.
DELIMITER //
CREATE FUNCTION GenerateDeduplicationReport(table_name VARCHAR(64))
RETURNS JSON
READS SQL DATA
-- Fixed: the function reads table data and calls NOW(), so it must NOT be
-- declared DETERMINISTIC (a false declaration can corrupt replication).
NOT DETERMINISTIC
BEGIN
DECLARE total_records INT;
DECLARE unique_records INT;
DECLARE duplicate_records INT;
DECLARE report JSON;
-- Count all rows, then distinct rows over the full de-dup key.
SELECT COUNT(*) INTO total_records FROM customers;
SELECT COUNT(*) INTO unique_records
FROM (
SELECT DISTINCT first_name, last_name, email, phone, city
FROM customers
) as unique_data;
SET duplicate_records = total_records - unique_records;
SET report = JSON_OBJECT(
'table_name', table_name,
'total_records', total_records,
'unique_records', unique_records,
'duplicate_records', duplicate_records,
-- NULLIF guards the empty-table case: rate is NULL instead of a
-- division-by-zero result.
'duplication_rate', ROUND((duplicate_records / NULLIF(total_records, 0)) * 100, 2),
'analysis_date', NOW()
);
RETURN report;
END //
DELIMITER ;
-- Generate the report.
SELECT GenerateDeduplicationReport('customers') as deduplication_report;
实时重复检测
-- Audit table for duplicates detected at insert time (written by trigger).
CREATE TABLE duplicate_detection_log (
id INT AUTO_INCREMENT PRIMARY KEY,
table_name VARCHAR(64),
record_id INT,
duplicate_type VARCHAR(50),
duplicate_details JSON,
detected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
resolved BOOLEAN DEFAULT FALSE
);
-- AFTER INSERT trigger: logs email/phone collisions with pre-existing rows
-- into duplicate_detection_log. NOTE(review): this adds two table scans to
-- every insert — keep indexes on email/phone, and confirm the overhead is
-- acceptable before using in a hot path.
DELIMITER //
CREATE TRIGGER check_customer_duplicates
AFTER INSERT ON customers
FOR EACH ROW
BEGIN
DECLARE duplicate_count INT;
DECLARE duplicate_ids TEXT;
-- Check for email duplicates among OTHER rows (id != NEW.id).
SELECT COUNT(*), GROUP_CONCAT(id)
INTO duplicate_count, duplicate_ids
FROM customers
WHERE email = NEW.email AND id != NEW.id;
IF duplicate_count > 0 THEN
INSERT INTO duplicate_detection_log
(table_name, record_id, duplicate_type, duplicate_details)
VALUES (
'customers',
NEW.id,
'EMAIL_DUPLICATE',
JSON_OBJECT(
'email', NEW.email,
'duplicate_ids', duplicate_ids,
'duplicate_count', duplicate_count
)
);
END IF;
-- Check for phone duplicates; the IS NOT NULL guard skips NULL phones
-- (and a NULL NEW.phone never equals anything anyway).
SELECT COUNT(*), GROUP_CONCAT(id)
INTO duplicate_count, duplicate_ids
FROM customers
WHERE phone = NEW.phone AND id != NEW.id AND phone IS NOT NULL;
IF duplicate_count > 0 THEN
INSERT INTO duplicate_detection_log
(table_name, record_id, duplicate_type, duplicate_details)
VALUES (
'customers',
NEW.id,
'PHONE_DUPLICATE',
JSON_OBJECT(
'phone', NEW.phone,
'duplicate_ids', duplicate_ids,
'duplicate_count', duplicate_count
)
);
END IF;
END //
DELIMITER ;
-- Exercise the trigger: this email already exists in customers.
INSERT INTO customers (first_name, last_name, email, phone, city) VALUES
('Test', 'User', 'john.doe@email.com', '999-888-7777', 'Test City');
-- Inspect the detection log.
SELECT
id,
table_name,
record_id,
duplicate_type,
JSON_PRETTY(duplicate_details) as details,
detected_at
FROM duplicate_detection_log
ORDER BY detected_at DESC;
重复处理策略比较
方法 | 性能 | 安全性 | 灵活性 | 复杂度 | 适用场景 |
---|---|---|---|---|---|
唯一约束 | 很高 | 很高 | 低 | 很低 | 预防重复,简单场景 |
INSERT IGNORE | 高 | 中等 | 中等 | 低 | 批量导入,忽略重复 |
ON DUPLICATE KEY UPDATE | 高 | 高 | 很高 | 中等 | 更新重复记录 |
REPLACE INTO | 高 | 中等 | 中等 | 低 | 完全替换重复记录 |
自连接删除 | 中等 | 低 | 高 | 中等 | 清理现有重复 |
窗口函数删除 | 中等 | 中等 | 很高 | 高 | 复杂去重逻辑 |
重建表 | 低 | 很高 | 很高 | 中等 | 大量重复数据 |
最佳实践和注意事项
重复数据处理最佳实践:
- 预防优于治疗:设计阶段就考虑唯一性约束
- 定期监控:建立重复数据检测机制
- 备份先行:删除重复数据前必须备份
- 分步处理:大量数据分批处理避免锁表
- 测试验证:在测试环境验证去重逻辑
- 文档记录:记录去重规则和决策依据
注意事项和风险:
- 删除操作不可逆,务必先备份数据
- 大表去重可能导致长时间锁表
- 外键约束可能阻止重复记录删除
- 自连接删除在大表上性能较差
- 模糊匹配可能误删有效数据
- 并发环境下需要考虑事务隔离
- 触发器可能影响插入性能
数据量 | 推荐方法 | 注意事项 |
---|---|---|
小表 (<1万) | 自连接删除 | 简单直接 |
中表 (1-100万) | 窗口函数 + 分批 | 控制批次大小 |
大表 (>100万) | 重建表 | 离线处理 |
实时系统 | 唯一约束 + 触发器 | 性能监控 |
批量导入 | INSERT IGNORE | 数据验证 |
-- Tear down every object created by this tutorial. The trigger is dropped
-- before its base table; IF EXISTS keeps the script re-runnable.
DROP TRIGGER IF EXISTS check_customer_duplicates;
DROP TABLE IF EXISTS duplicate_detection_log;
DROP TABLE IF EXISTS contacts;
DROP TABLE IF EXISTS orders_with_duplicates;
DROP TABLE IF EXISTS customers_dedup;
DROP TABLE IF EXISTS customers_backup;
DROP TABLE IF EXISTS test_duplicates;
DROP TABLE IF EXISTS products;
DROP TABLE IF EXISTS customers_clean;
DROP TABLE IF EXISTS users_unique;
DROP TABLE IF EXISTS customers;
DROP FUNCTION IF EXISTS GenerateDeduplicationReport;
DROP PROCEDURE IF EXISTS DeduplicateTable;
SELECT 'Duplicate handling tutorial completed. All example objects cleaned up.' AS message;