SQL
Find and Delete Duplicate Records (Non-Window Function Method)
Discover how to identify and remove duplicate rows in a table based on specific columns without relying on window functions. Preserve one unique record.
-- Sample table with potential duplicates
CREATE TABLE users (
id SERIAL PRIMARY KEY,
email VARCHAR(255) NOT NULL,
username VARCHAR(100) NOT NULL,
created_at TIMESTAMP DEFAULT NOW()
);
-- Insert sample data with duplicates
INSERT INTO users (email, username) VALUES
('[email protected]', 'alice'),
('[email protected]', 'bob'),
('[email protected]', 'alice'), -- Duplicate
('[email protected]', 'charlie'),
('[email protected]', 'bob'), -- Duplicate
('[email protected]', 'david'),
('[email protected]', 'alice'); -- Another duplicate
-- Find all duplicate records based on email and username
SELECT email, username, COUNT(*)
FROM users
GROUP BY email, username
HAVING COUNT(*) > 1;
-- Find all records that are duplicates (including the first occurrence)
-- This shows ALL rows involved in duplicates
SELECT u1.*
FROM users u1
JOIN (
SELECT email, username
FROM users
GROUP BY email, username
HAVING COUNT(*) > 1
) AS duplicates ON u1.email = duplicates.email AND u1.username = duplicates.username
ORDER BY u1.email, u1.username, u1.id;
-- Delete duplicate records, keeping the one with the MINIMUM 'id'
-- (This approach works for PostgreSQL and similar, might vary for MySQL/SQL Server)
DELETE FROM users
WHERE id IN (
SELECT u_inner.id
FROM users u_inner
JOIN (
SELECT email, username, MIN(id) as min_id
FROM users
GROUP BY email, username
HAVING COUNT(*) > 1
) AS unique_records
ON u_inner.email = unique_records.email
AND u_inner.username = unique_records.username
WHERE u_inner.id > unique_records.min_id -- Keep the min_id, delete others
);
-- Verify remaining records
SELECT * FROM users ORDER BY id;
How it works: This snippet shows how to identify and delete duplicate records in a table *without* using window functions, which were explicitly excluded. First, it demonstrates how to find combinations of `email` and `username` that appear more than once using `GROUP BY` and `HAVING`. Then, it shows a method to delete these duplicates while preserving one record (in this case, the one with the `MIN(id)` for each duplicate set) by using a subquery to identify the `id`s of the rows to be removed. This is a common data cleaning task for maintaining data integrity.