🔄 MySQL 处理重复数据

数据去重和重复处理 - 保持数据的唯一性和完整性

重复数据概述

重复数据是数据库中常见的问题,可能由于数据导入错误、应用程序逻辑缺陷、并发插入等原因产生。有效处理重复数据对于维护数据质量和系统性能至关重要。

🔍 识别重复

使用 GROUP BY 和聚合函数找出重复记录

🚫 防止重复

通过约束和索引预防重复数据产生

🗑️ 删除重复

使用各种技术删除已存在的重复记录

🔄 处理重复

在插入时智能处理重复数据

重复数据的常见原因:
  • 数据导入时缺乏去重机制
  • 应用程序的重复提交
  • 并发操作导致的竞态条件
  • 数据同步过程中的错误
  • 缺乏适当的唯一性约束
  • ETL 过程中的数据重复

识别重复数据

准备示例数据

-- Create the sample table used throughout the examples.
CREATE TABLE customers (
    id INT AUTO_INCREMENT PRIMARY KEY,
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    email VARCHAR(100),
    phone VARCHAR(20),
    city VARCHAR(50),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Seed rows that deliberately include duplicates.
INSERT INTO customers (first_name, last_name, email, phone, city) VALUES
('John', 'Doe', 'john.doe@email.com', '123-456-7890', 'New York'),
('Jane', 'Smith', 'jane.smith@email.com', '234-567-8901', 'Los Angeles'),
('John', 'Doe', 'john.doe@email.com', '123-456-7890', 'New York'),          -- exact duplicate
('Jane', 'Smith', 'jane.smith@gmail.com', '234-567-8901', 'Los Angeles'),   -- different email
('Bob', 'Johnson', 'bob.johnson@email.com', '345-678-9012', 'Chicago'),
('John', 'Doe', 'john.d@email.com', '123-456-7890', 'New York'),            -- different email
('Alice', 'Brown', 'alice.brown@email.com', '456-789-0123', 'Houston'),
('Bob', 'Johnson', 'bob.johnson@email.com', '345-678-9012', 'Chicago'),     -- exact duplicate
('Charlie', 'Wilson', 'charlie.wilson@email.com', '567-890-1234', 'Phoenix'),
('Alice', 'Brown', 'alice.b@email.com', '456-789-0123', 'Houston');         -- different email

-- Inspect all rows.
SELECT * FROM customers ORDER BY first_name, last_name;

基本重复检测

-- Exact duplicates (ignoring the surrogate id).
SELECT
    first_name, last_name, email, phone, city,
    COUNT(*) AS duplicate_count
FROM customers
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1
ORDER BY duplicate_count DESC;

-- Duplicates keyed on name only, with the variants collected.
SELECT
    first_name, last_name,
    COUNT(*) AS name_duplicates,
    GROUP_CONCAT(DISTINCT email ORDER BY email) AS emails,
    GROUP_CONCAT(DISTINCT phone ORDER BY phone) AS phones
FROM customers
GROUP BY first_name, last_name
HAVING COUNT(*) > 1
ORDER BY name_duplicates DESC;

-- Duplicates keyed on email.
SELECT
    email,
    COUNT(*) AS email_duplicates,
    GROUP_CONCAT(CONCAT(first_name, ' ', last_name) ORDER BY id) AS names
FROM customers
GROUP BY email
HAVING COUNT(*) > 1;

-- Duplicates keyed on phone.
SELECT
    phone,
    COUNT(*) AS phone_duplicates,
    GROUP_CONCAT(CONCAT(first_name, ' ', last_name) ORDER BY id) AS names
FROM customers
GROUP BY phone
HAVING COUNT(*) > 1;

高级重复检测

-- Tag each row with its position and group size within its duplicate group.
SELECT
    id, first_name, last_name, email, phone,
    ROW_NUMBER() OVER (
        PARTITION BY first_name, last_name, email, phone, city
        ORDER BY id
    ) AS row_num,
    COUNT(*) OVER (
        PARTITION BY first_name, last_name, email, phone, city
    ) AS total_duplicates
FROM customers
ORDER BY first_name, last_name, id;

-- Every duplicate except the earliest row of each group.
WITH duplicate_analysis AS (
    SELECT
        id, first_name, last_name, email, phone, city,
        ROW_NUMBER() OVER (
            PARTITION BY first_name, last_name, email, phone, city
            ORDER BY id
        ) AS row_num
    FROM customers
)
SELECT id, first_name, last_name, email, 'DUPLICATE' AS status
FROM duplicate_analysis
WHERE row_num > 1
ORDER BY first_name, last_name;

-- Fuzzy match: same name and phone but a different email.
SELECT
    c1.id AS id1, c2.id AS id2,
    c1.first_name, c1.last_name,
    c1.email AS email1, c2.email AS email2,
    c1.phone,
    'SIMILAR_RECORDS' AS match_type
FROM customers c1
INNER JOIN customers c2 ON c1.id < c2.id
WHERE c1.first_name = c2.first_name
  AND c1.last_name = c2.last_name
  AND c1.phone = c2.phone
  AND c1.email != c2.email
ORDER BY c1.first_name, c1.last_name;

重复数据统计报告

-- Overall duplication statistics.
SELECT 'Total Records' AS metric, COUNT(*) AS count FROM customers
UNION ALL
SELECT 'Unique Records (by name+email+phone)', COUNT(*)
FROM (
    SELECT DISTINCT first_name, last_name, email, phone, city
    FROM customers
) AS unique_records
UNION ALL
SELECT 'Duplicate Records',
       COUNT(*) - (
           SELECT COUNT(*)
           FROM (
               SELECT DISTINCT first_name, last_name, email, phone, city
               FROM customers
           ) AS unique_records
       )
FROM customers
UNION ALL
SELECT 'Duplicate Groups', COUNT(*)
FROM (
    SELECT first_name, last_name, email, phone, city
    FROM customers
    GROUP BY first_name, last_name, email, phone, city
    HAVING COUNT(*) > 1
) AS duplicate_groups;

-- Breakdown by duplicate type (extra_records = rows beyond the first per group).
SELECT 'Name Duplicates' AS duplicate_type,
       COUNT(*) AS groups,
       SUM(duplicate_count - 1) AS extra_records
FROM (
    SELECT COUNT(*) AS duplicate_count
    FROM customers
    GROUP BY first_name, last_name
    HAVING COUNT(*) > 1
) AS name_dups
UNION ALL
SELECT 'Email Duplicates', COUNT(*), SUM(duplicate_count - 1)
FROM (
    SELECT COUNT(*) AS duplicate_count
    FROM customers
    GROUP BY email
    HAVING COUNT(*) > 1
) AS email_dups
UNION ALL
SELECT 'Phone Duplicates', COUNT(*), SUM(duplicate_count - 1)
FROM (
    SELECT COUNT(*) AS duplicate_count
    FROM customers
    GROUP BY phone
    HAVING COUNT(*) > 1
) AS phone_dups;

防止重复数据

使用唯一约束

-- Table whose uniqueness rules are enforced declaratively.
CREATE TABLE users_unique (
    id INT AUTO_INCREMENT PRIMARY KEY,
    username VARCHAR(50) NOT NULL UNIQUE,
    email VARCHAR(100) NOT NULL UNIQUE,
    phone VARCHAR(20),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- composite uniqueness across phone + email
    UNIQUE KEY uk_phone_email (phone, email)
);

-- Build a deduplicated copy of customers, then constrain it.
CREATE TABLE customers_clean AS
SELECT DISTINCT first_name, last_name, email, phone, city
FROM customers;

ALTER TABLE customers_clean ADD COLUMN id INT AUTO_INCREMENT PRIMARY KEY FIRST;
ALTER TABLE customers_clean ADD UNIQUE KEY uk_email (email);
ALTER TABLE customers_clean ADD UNIQUE KEY uk_phone (phone);
ALTER TABLE customers_clean ADD UNIQUE KEY uk_name_phone (first_name, last_name, phone);

-- Exercise the constraints.
INSERT INTO users_unique (username, email, phone) VALUES
('john_doe', 'john@example.com', '123-456-7890');

-- This would fail with a duplicate-key error on email:
-- INSERT INTO users_unique (username, email, phone) VALUES
-- ('jane_doe', 'john@example.com', '234-567-8901');

-- Inspect constraint metadata.
-- Fix: CONSTRAINT_TYPE does not exist in KEY_COLUMN_USAGE; it lives in
-- TABLE_CONSTRAINTS, so the two INFORMATION_SCHEMA views must be joined.
SELECT
    tc.CONSTRAINT_NAME,
    kcu.COLUMN_NAME,
    tc.CONSTRAINT_TYPE
FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
INNER JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE kcu
    ON kcu.CONSTRAINT_SCHEMA = tc.CONSTRAINT_SCHEMA
   AND kcu.CONSTRAINT_NAME = tc.CONSTRAINT_NAME
   AND kcu.TABLE_NAME = tc.TABLE_NAME
WHERE tc.TABLE_SCHEMA = DATABASE()
  AND tc.TABLE_NAME = 'users_unique'
  AND tc.CONSTRAINT_NAME != 'PRIMARY'
ORDER BY tc.CONSTRAINT_NAME;

使用 INSERT IGNORE

-- INSERT IGNORE silently skips rows that would violate a unique key.
INSERT IGNORE INTO users_unique (username, email, phone) VALUES
('john_doe2', 'john@example.com', '123-456-7890'),   -- duplicate email: skipped
('jane_smith', 'jane@example.com', '234-567-8901'),  -- new row: inserted
('bob_wilson', 'john@example.com', '345-678-9012');  -- duplicate email: skipped

-- Fix: ROW_COUNT() reflects only the immediately preceding statement and
-- returns -1 after a result-set SELECT, so it must be read right after the
-- INSERT IGNORE — before any other statement runs.
SELECT ROW_COUNT() AS affected_rows;

-- Inspect the table contents.
SELECT * FROM users_unique;

-- Bulk load with INSERT IGNORE; constrained duplicates are dropped.
INSERT IGNORE INTO customers_clean (first_name, last_name, email, phone, city)
SELECT first_name, last_name, email, phone, city
FROM customers;

SELECT COUNT(*) AS total_inserted FROM customers_clean;

使用 ON DUPLICATE KEY UPDATE

-- Products table with an auto-maintained updated_at column.
CREATE TABLE products (
    id INT AUTO_INCREMENT PRIMARY KEY,
    product_code VARCHAR(50) UNIQUE,
    product_name VARCHAR(100),
    price DECIMAL(10,2),
    stock_quantity INT DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);

-- Upsert: insert, or update in place on a duplicate key.
-- NOTE(review): VALUES() in the UPDATE clause is deprecated as of MySQL
-- 8.0.20 in favor of row aliases (INSERT ... AS new ... new.col); kept
-- here for compatibility with older servers.
INSERT INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Product A', 99.99, 100),
('PROD002', 'Product B', 149.99, 50),
('PROD003', 'Product C', 199.99, 25)
ON DUPLICATE KEY UPDATE
    product_name = VALUES(product_name),
    price = VALUES(price),
    stock_quantity = stock_quantity + VALUES(stock_quantity),
    updated_at = CURRENT_TIMESTAMP;

-- Same codes again: existing rows are updated instead of inserted.
INSERT INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Updated Product A', 109.99, 50),
('PROD004', 'Product D', 79.99, 75)
ON DUPLICATE KEY UPDATE
    product_name = VALUES(product_name),
    price = VALUES(price),
    stock_quantity = stock_quantity + VALUES(stock_quantity),
    updated_at = CURRENT_TIMESTAMP;

SELECT * FROM products ORDER BY product_code;

-- Conditional upsert: only take the new name when the price increased.
INSERT INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Product A v2', 119.99, 25)
ON DUPLICATE KEY UPDATE
    product_name = CASE
        WHEN VALUES(price) > price THEN VALUES(product_name)
        ELSE product_name
    END,
    price = GREATEST(price, VALUES(price)),
    stock_quantity = stock_quantity + VALUES(stock_quantity),
    updated_at = CURRENT_TIMESTAMP;

使用 REPLACE INTO

-- REPLACE deletes any row with a conflicting key, then inserts the new one.
REPLACE INTO products (product_code, product_name, price, stock_quantity) VALUES
('PROD001', 'Replaced Product A', 129.99, 200),
('PROD005', 'Product E', 59.99, 150);

-- Note: the replaced PROD001 row receives a NEW auto-increment id.
SELECT * FROM products ORDER BY product_code;

-- Bulk REPLACE via a staging table.
CREATE TEMPORARY TABLE temp_products (
    product_code VARCHAR(50),
    product_name VARCHAR(100),
    price DECIMAL(10,2),
    stock_quantity INT
);

INSERT INTO temp_products VALUES
('PROD001', 'Final Product A', 139.99, 300),
('PROD002', 'Final Product B', 159.99, 100),
('PROD006', 'Product F', 89.99, 80);

REPLACE INTO products (product_code, product_name, price, stock_quantity)
SELECT product_code, product_name, price, stock_quantity
FROM temp_products;

SELECT * FROM products ORDER BY product_code;

删除重复数据

使用自连接删除重复

-- Work on a throwaway copy of customers.
CREATE TABLE test_duplicates AS SELECT * FROM customers;

-- Confirm which groups are duplicated.
SELECT
    first_name, last_name, email, phone, city,
    COUNT(*) AS count
FROM test_duplicates
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1;

-- Method 1: self-join delete, keeping the row with the lowest id.
-- (For an inner join, placing the predicates in ON is equivalent to WHERE.)
DELETE t1
FROM test_duplicates t1
INNER JOIN test_duplicates t2
    ON t1.id > t2.id
   AND t1.first_name = t2.first_name
   AND t1.last_name = t2.last_name
   AND t1.email = t2.email
   AND t1.phone = t2.phone
   AND t1.city = t2.city;

-- Verify.
SELECT COUNT(*) AS remaining_records FROM test_duplicates;
SELECT * FROM test_duplicates ORDER BY first_name, last_name;

使用窗口函数删除重复

-- Rebuild the test data.
-- Fix: IF EXISTS on both drops makes the script safely re-runnable
-- (the original DROP TABLE errors if the table is absent, and the
-- temporary table cannot be re-created within the same session).
DROP TABLE IF EXISTS test_duplicates;
CREATE TABLE test_duplicates AS SELECT * FROM customers;

-- Method 2: rank rows within each duplicate group, keep only rank 1.
DROP TEMPORARY TABLE IF EXISTS temp_keep_ids;
CREATE TEMPORARY TABLE temp_keep_ids AS
SELECT id
FROM (
    SELECT
        id,
        ROW_NUMBER() OVER (
            PARTITION BY first_name, last_name, email, phone, city
            ORDER BY id
        ) AS row_num
    FROM test_duplicates
) ranked
WHERE row_num = 1;

-- Delete everything not on the keep list.
-- (temp_keep_ids.id can never be NULL, so NOT IN is safe here.)
DELETE FROM test_duplicates
WHERE id NOT IN (SELECT id FROM temp_keep_ids);

SELECT COUNT(*) AS remaining_records FROM test_duplicates;

-- Confirm no duplicate groups remain.
SELECT first_name, last_name, email, phone, city, COUNT(*) AS count
FROM test_duplicates
GROUP BY first_name, last_name, email, phone, city
HAVING COUNT(*) > 1;

使用 GROUP BY 重建表

-- Method 3: rebuild the table (the safest approach).
-- Always back up before restructuring.
CREATE TABLE customers_backup AS SELECT * FROM customers;

-- Build the deduplicated table, keeping the lowest id and the earliest
-- created_at within each group.
CREATE TABLE customers_dedup AS
SELECT
    MIN(id) AS id,
    first_name,
    last_name,
    email,
    phone,
    city,
    MIN(created_at) AS created_at
FROM customers
GROUP BY first_name, last_name, email, phone, city;

-- Compare before/after row counts.
SELECT COUNT(*) AS original_count FROM customers;
SELECT COUNT(*) AS dedup_count FROM customers_dedup;
SELECT * FROM customers_dedup ORDER BY first_name, last_name;

-- Once verified, swap the tables in one atomic statement:
-- RENAME TABLE customers TO customers_old, customers_dedup TO customers;

条件性删除重复

-- A richer duplicate scenario: same order, conflicting statuses.
CREATE TABLE orders_with_duplicates (
    id INT AUTO_INCREMENT PRIMARY KEY,
    order_number VARCHAR(50),
    customer_id INT,
    order_date DATE,
    amount DECIMAL(10,2),
    status VARCHAR(20),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

INSERT INTO orders_with_duplicates (order_number, customer_id, order_date, amount, status) VALUES
('ORD001', 1, '2024-01-15', 299.99, 'completed'),
('ORD001', 1, '2024-01-15', 299.99, 'pending'),    -- duplicate, different status
('ORD002', 2, '2024-01-16', 149.50, 'completed'),
('ORD003', 3, '2024-01-17', 89.99, 'completed'),
('ORD003', 3, '2024-01-17', 89.99, 'completed'),   -- exact duplicate
('ORD004', 1, '2024-01-18', 199.99, 'cancelled'),
('ORD004', 1, '2024-01-18', 199.99, 'completed');  -- duplicate, different status

-- Remove duplicate orders, preferring the 'completed' row.
-- Fix: the original joined on o1.id > o2.id, so a non-completed row
-- inserted BEFORE its completed twin (e.g. ORD004's 'cancelled' row)
-- was never deleted. Join on id <> id and state the keep rules explicitly.
DELETE o1
FROM orders_with_duplicates o1
INNER JOIN orders_with_duplicates o2
    ON o1.order_number = o2.order_number
   AND o1.customer_id = o2.customer_id
   AND o1.order_date = o2.order_date
   AND o1.amount = o2.amount
   AND o1.id <> o2.id
WHERE (o1.status = o2.status AND o1.id > o2.id)                -- exact duplicate: keep lowest id
   OR (o1.status != 'completed' AND o2.status = 'completed');  -- prefer the completed row

SELECT * FROM orders_with_duplicates ORDER BY order_number;

-- Alternative policy: keep only the most recently created row per group.
DELETE o1
FROM orders_with_duplicates o1
INNER JOIN (
    SELECT order_number, customer_id, order_date, amount,
           MAX(created_at) AS latest_created
    FROM orders_with_duplicates
    GROUP BY order_number, customer_id, order_date, amount
    HAVING COUNT(*) > 1
) o2
    ON o1.order_number = o2.order_number
   AND o1.customer_id = o2.customer_id
   AND o1.order_date = o2.order_date
   AND o1.amount = o2.amount
   AND o1.created_at < o2.latest_created;

SELECT * FROM orders_with_duplicates ORDER BY order_number;

高级重复处理技术

模糊匹配去重

-- Contacts with near-duplicate entries (formatting variants).
CREATE TABLE contacts (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100),
    email VARCHAR(100),
    phone VARCHAR(20),
    company VARCHAR(100)
);

INSERT INTO contacts (name, email, phone, company) VALUES
('John Smith', 'john.smith@company.com', '123-456-7890', 'ABC Corp'),
('J. Smith', 'j.smith@company.com', '123-456-7890', 'ABC Corporation'),
('John A. Smith', 'john.smith@company.com', '(123) 456-7890', 'ABC Corp'),
('Jane Doe', 'jane.doe@email.com', '234-567-8901', 'XYZ Inc'),
('Jane M. Doe', 'jane.doe@email.com', '234.567.8901', 'XYZ Inc.');

-- Phonetic matching with SOUNDEX.
SELECT
    c1.id AS id1, c1.name AS name1,
    c2.id AS id2, c2.name AS name2,
    c1.phone,
    SOUNDEX(c1.name) AS soundex1,
    SOUNDEX(c2.name) AS soundex2
FROM contacts c1
INNER JOIN contacts c2 ON c1.id < c2.id
WHERE SOUNDEX(c1.name) = SOUNDEX(c2.name)
   OR (c1.phone = c2.phone AND c1.phone IS NOT NULL);

-- Strip non-digits to normalize phone numbers (MySQL 8.0+ REGEXP_REPLACE).
SELECT
    id, name, phone,
    REGEXP_REPLACE(phone, '[^0-9]', '') AS normalized_phone
FROM contacts;

-- Duplicates detected on the normalized phone number.
WITH normalized_contacts AS (
    SELECT
        id, name, email, phone,
        REGEXP_REPLACE(phone, '[^0-9]', '') AS clean_phone,
        company
    FROM contacts
)
SELECT
    c1.id, c1.name, c1.email, c1.phone, c1.company,
    'DUPLICATE_PHONE' AS match_reason
FROM normalized_contacts c1
INNER JOIN normalized_contacts c2 ON c1.id > c2.id
WHERE c1.clean_phone = c2.clean_phone
  AND c1.clean_phone != '';

批量去重处理

-- Generic batch-deduplication procedure.
-- SECURITY: table_name / key_columns / keep_criteria are concatenated into
-- dynamic SQL and cannot be parameterized; only call this with trusted,
-- hard-coded arguments — never user input.
DELIMITER //

CREATE PROCEDURE DeduplicateTable(
    IN table_name VARCHAR(64),
    IN key_columns TEXT,
    IN keep_criteria VARCHAR(255)
)
BEGIN
    DECLARE sql_stmt TEXT;
    DECLARE backup_table VARCHAR(64);
    DECLARE removed_rows INT;

    -- Timestamped backup table name.
    SET backup_table = CONCAT(table_name, '_backup_', DATE_FORMAT(NOW(), '%Y%m%d_%H%i%s'));

    -- Back up the original table before deleting anything.
    SET sql_stmt = CONCAT('CREATE TABLE ', backup_table, ' AS SELECT * FROM ', table_name);
    SET @sql = sql_stmt;
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;

    -- Self-join delete keeping the lowest id per duplicate group.
    SET sql_stmt = CONCAT(
        'DELETE t1 FROM ', table_name, ' t1 ',
        'INNER JOIN ', table_name, ' t2 ',
        'WHERE t1.id > t2.id ',
        'AND (', key_columns, ') ',
        IFNULL(CONCAT('AND ', keep_criteria), '')
    );
    SET @sql = sql_stmt;
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    -- Fix: ROW_COUNT() must be captured immediately after EXECUTE —
    -- the original read it after DEALLOCATE PREPARE and the final SELECT,
    -- by which point the delete count is lost.
    SET removed_rows = ROW_COUNT();
    DEALLOCATE PREPARE stmt;

    -- Report what happened.
    SELECT
        table_name AS processed_table,
        backup_table AS backup_created,
        removed_rows AS records_removed;
END //

DELIMITER ;

-- Example invocation:
-- CALL DeduplicateTable(
--     'customers',
--     't1.first_name = t2.first_name AND t1.last_name = t2.last_name AND t1.email = t2.email',
--     NULL
-- );

-- Deduplication report as a JSON document.
-- Fix: dropped the DETERMINISTIC flag — the function reads live table data,
-- so its result is not deterministic; READS SQL DATA alone satisfies the
-- binary-logging requirement.
DELIMITER //

CREATE FUNCTION GenerateDeduplicationReport(table_name VARCHAR(64))
RETURNS JSON
READS SQL DATA
BEGIN
    DECLARE total_records INT;
    DECLARE unique_records INT;
    DECLARE duplicate_records INT;
    DECLARE report JSON;

    -- Simplified: the customers table is hard-coded below; a production
    -- version would have to build dynamic SQL from table_name.
    SELECT COUNT(*) INTO total_records FROM customers;

    SELECT COUNT(*) INTO unique_records
    FROM (
        SELECT DISTINCT first_name, last_name, email, phone, city
        FROM customers
    ) AS unique_data;

    SET duplicate_records = total_records - unique_records;

    SET report = JSON_OBJECT(
        'table_name', table_name,
        'total_records', total_records,
        'unique_records', unique_records,
        'duplicate_records', duplicate_records,
        'duplication_rate', ROUND((duplicate_records / total_records) * 100, 2),
        'analysis_date', NOW()
    );

    RETURN report;
END //

DELIMITER ;

-- Generate the report.
SELECT GenerateDeduplicationReport('customers') AS deduplication_report;

实时重复检测

-- Log table for detected duplicates.
CREATE TABLE duplicate_detection_log (
    id INT AUTO_INCREMENT PRIMARY KEY,
    table_name VARCHAR(64),
    record_id INT,
    duplicate_type VARCHAR(50),
    duplicate_details JSON,
    detected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    resolved BOOLEAN DEFAULT FALSE
);

-- Trigger that records email/phone collisions on every insert.
DELIMITER //

CREATE TRIGGER check_customer_duplicates
AFTER INSERT ON customers
FOR EACH ROW
BEGIN
    DECLARE duplicate_count INT;
    DECLARE duplicate_ids TEXT;

    -- Email collision check.
    SELECT COUNT(*), GROUP_CONCAT(id)
    INTO duplicate_count, duplicate_ids
    FROM customers
    WHERE email = NEW.email AND id != NEW.id;

    IF duplicate_count > 0 THEN
        INSERT INTO duplicate_detection_log (table_name, record_id, duplicate_type, duplicate_details)
        VALUES (
            'customers',
            NEW.id,
            'EMAIL_DUPLICATE',
            JSON_OBJECT(
                'email', NEW.email,
                'duplicate_ids', duplicate_ids,
                'duplicate_count', duplicate_count
            )
        );
    END IF;

    -- Phone collision check (NULL phones are skipped).
    SELECT COUNT(*), GROUP_CONCAT(id)
    INTO duplicate_count, duplicate_ids
    FROM customers
    WHERE phone = NEW.phone AND id != NEW.id AND phone IS NOT NULL;

    IF duplicate_count > 0 THEN
        INSERT INTO duplicate_detection_log (table_name, record_id, duplicate_type, duplicate_details)
        VALUES (
            'customers',
            NEW.id,
            'PHONE_DUPLICATE',
            JSON_OBJECT(
                'phone', NEW.phone,
                'duplicate_ids', duplicate_ids,
                'duplicate_count', duplicate_count
            )
        );
    END IF;
END //

DELIMITER ;

-- Fire the trigger with a known-duplicate email.
INSERT INTO customers (first_name, last_name, email, phone, city) VALUES
('Test', 'User', 'john.doe@email.com', '999-888-7777', 'Test City');

-- Review what was logged.
SELECT
    id, table_name, record_id, duplicate_type,
    JSON_PRETTY(duplicate_details) AS details,
    detected_at
FROM duplicate_detection_log
ORDER BY detected_at DESC;

重复处理策略比较

| 方法 | 性能 | 安全性 | 灵活性 | 复杂度 | 适用场景 |
| --- | --- | --- | --- | --- | --- |
| 唯一约束 | 很高 | 很高 | 低 | 很低 | 预防重复,简单场景 |
| INSERT IGNORE | 高 | 中等 | 中等 | 低 | 批量导入,忽略重复 |
| ON DUPLICATE KEY UPDATE | 很高 | 高 | 中等 | 中等 | 更新重复记录 |
| REPLACE INTO | 中等 | 中等 | 中等 | 低 | 完全替换重复记录 |
| 自连接删除 | 中等 | 高 | 中等 | 中等 | 清理现有重复 |
| 窗口函数删除 | 中等 | 高 | 很高 | 中等 | 复杂去重逻辑 |
| 重建表 | 很高 | 很高 | 中等 | 中等 | 大量重复数据 |

最佳实践和注意事项

重复数据处理最佳实践:
  • 预防优于治疗:设计阶段就考虑唯一性约束
  • 定期监控:建立重复数据检测机制
  • 备份先行:删除重复数据前必须备份
  • 分步处理:大量数据分批处理避免锁表
  • 测试验证:在测试环境验证去重逻辑
  • 文档记录:记录去重规则和决策依据
注意事项和风险:
  • 删除操作不可逆,务必先备份数据
  • 大表去重可能导致长时间锁表
  • 外键约束可能阻止重复记录删除
  • 自连接删除在大表上性能较差
  • 模糊匹配可能误删有效数据
  • 并发环境下需要考虑事务隔离
  • 触发器可能影响插入性能
| 数据量 | 推荐方法 | 注意事项 |
| --- | --- | --- |
| 小表(<1万) | 自连接删除 | 简单直接 |
| 中表(1-100万) | 窗口函数 + 分批 | 控制批次大小 |
| 大表(>100万) | 重建表 | 离线处理 |
| 实时系统 | 唯一约束 + 触发器 | 性能监控 |
| 批量导入 | INSERT IGNORE | 数据验证 |
-- Tear down every object created by this tutorial.
DROP TRIGGER IF EXISTS check_customer_duplicates;
DROP TABLE IF EXISTS duplicate_detection_log;
DROP TABLE IF EXISTS contacts;
DROP TABLE IF EXISTS orders_with_duplicates;
DROP TABLE IF EXISTS customers_dedup;
DROP TABLE IF EXISTS customers_backup;
DROP TABLE IF EXISTS test_duplicates;
DROP TABLE IF EXISTS products;
DROP TABLE IF EXISTS customers_clean;
DROP TABLE IF EXISTS users_unique;
DROP TABLE IF EXISTS customers;
DROP FUNCTION IF EXISTS GenerateDeduplicationReport;
DROP PROCEDURE IF EXISTS DeduplicateTable;

SELECT 'Duplicate handling tutorial completed. All example objects cleaned up.' AS message;