Deleting Duplicate Data From SQL Server Table
Duplicate data is a very well known and frequently occurring issue in any DBMS and dealing with deletion of duplicate data is something not that easy for one specially one who is dealing with crucial and huge data.
There are 2 scenarios which can come while deleting duplicate data -
1. when we have an identity column in our table containing duplicates.
The problem becomes much easier here and we can just keep the data with MIN or MAX keys removing all other.
Use Below Query --
2. when we don't have any unique identifier in our table and it's a bit tricky here -
there are 2 ways of doing it here again --
(i) using the Row_Number() function comes with only SQL Server 2005 and later versions
(ii) Here is some piece of code which i will suggest as best to deal with this and can be used anywhere in even the older version of SQL Server like 2000 without using any special functions.
Please do comment if you find this post useful, suggestion are appreciated.
Happy Coding !!
There are 2 scenarios which can come while deleting duplicate data -
1. when we have an identity column in our table containing duplicates.
The problem becomes much easier here and we can just keep the data with MIN or MAX keys removing all other.
--Create New
Table
create table Person
(
ID int
identity(1,1),
Name varchar(30),
age int
)
--Insert Data
into Table with multiple Duplicate entries
insert into Person(Name,age) values('Ashish',24)
insert into Person(Name,age) values('Ashish',24)
insert into Person(Name,age) values('Gaurav',26)
insert into Person(Name,age) values('Nayan',25)
insert into Person(Name,age) values('Nayan',25)
insert into Person(Name,age) values('Nayan',25)
--Check the data
before deleting duplicates
select * from Person
DELETE FROM Person
WHERE ID NOT IN
(
SELECT MIN(ID) FROM Person
GROUP BY Name,age HAVING COUNT(1) > 1
UNION
SELECT MIN(ID) FROM Person
GROUP BY Name,age HAVING COUNT(1) = 1
)
--Check the data
After deleting duplicates
select * from Person
Use Below Query --
2. when we don't have any unique identifier in our table and it's a bit tricky here -
there are 2 ways of doing it here again --
(i) using the Row_Number() function comes with only SQL Server 2005 and later versions
--Check if Table
already exists
IF EXISTS(select * from INFORMATION_SCHEMA.TABLES where
TABLE_NAME = 'Person')
BEGIN
--Drop if already exists
Drop Table
Person
END
--Create New
Table
create table Person
(
Name varchar(30),
age int
)
--Insert Data
into Table with multiple Duplicate entries
insert into Person(Name,age) values('Ashish',24)
insert into Person(Name,age) values('Ashish',24)
insert into Person(Name,age) values('Gaurav',26)
insert into Person(Name,age) values('Nayan',25)
insert into Person(Name,age) values('Nayan',25)
insert into Person(Name,age) values('Nayan',25)
--Check the data
before deleting duplicates
select * from Person
;WITH CTE (Name,Age, DuplicateCount)
AS
(
SELECT Name,Age,
ROW_NUMBER() OVER(PARTITION BY Name,Age ORDER BY Name) AS DuplicateCount
FROM Person
)
DELETE
FROM CTE
WHERE
DuplicateCount > 1
GO
--Check the data
After deleting duplicates
select * from Person
(ii) Here is some piece of code which i will suggest as best to deal with this and can be used anywhere in even the older version of SQL Server like 2000 without using any special functions.
--Check if Table
already exists
IF EXISTS(select * from INFORMATION_SCHEMA.TABLES where
TABLE_NAME = 'Person')
BEGIN
--Drop if already exists
Drop Table
Person
END
--Create New
Table
create table Person
(
Name varchar(30),
age int
)
--Insert Data
into Table with multiple Duplicate entries
insert into Person(Name,age) values('Ashish',24)
insert into Person(Name,age) values('Ashish',24)
insert into Person(Name,age) values('Gaurav',26)
insert into Person(Name,age) values('Nayan',25)
insert into Person(Name,age) values('Nayan',25)
insert into Person(Name,age) values('Nayan',25)
--Check the data
before deleting duplicates
select * from Person
--Declare 2
variables as @name (which is duplicate key) and @duplicateCount as Number of
Duplicates
DECLARE @name varchar(30), @duplicateCount int
--Declare a
Cursor to iterate through each duplicate name
DECLARE
Cursor_Person CURSOR FOR
select Name,(count(1)-1) cnt
from Person
group by Name
having count(1) > 1
OPEN
Cursor_Person
FETCH NEXT FROM
Cursor_Person INTO @name, @duplicateCount
WHILE @@FETCH_STATUS = 0
BEGIN
--delete all
duplicate entries for each duplicate key keeping only one row
delete top (@duplicateCount) from Person where Name = @name
FETCH NEXT FROM
Cursor_Person INTO @name, @duplicateCount
END
CLOSE
Cursor_Person
DEALLOCATE
Cursor_Person
--Check the data
After deleting duplicates
select * from Person
Happy Coding !!
Good work man....
ReplyDeleteThanks !!
Delete