다양한 포맷의 이름 파싱 하기
- Version : SQL Server 2005, 2008, 2008R2, 2012, 2014
비즈니스 파트마다 다양한 형식의 이름(또는 전화번호) 세트를 사용한다. 이러한 경우 성과 이름을 어떻게 구분하여 정리 업무를 수행하는 사람이라면 한 번쯤 고민해보았을 것이다.
이번 시간에는 구분 할 수 있는 특정한 기호를 기준으로 이름을 파싱하여 사용하는 방법에 대해서 알아본다.
[공백으로 구분된 경우]
성과 이름 사이에 공백이 있는 포맷을 파싱하여 사용하는 방법이다. 실습용 테이블을 생성하고 데이터를 입력 한다.
-- create temporary table for storing source name strings -- and their associated name parts IF OBJECT_ID('tempdb..##NamesFromDifferentClients_1') IS NOT NULL DROP TABLE ##NamesFromDifferentClients_1
CREATE TABLE ##NamesFromDifferentClients_1 ( ID BIGINT IDENTITY(1,1), ClientID varchar(20), SourceString varchar(75), FirstName varchar(20), MiddleName varchar(20), LastName varchar(30), Suffix varchar(5) )
-- populate temporary table with source name strings INSERT INTO ##NamesFromDifferentClients_1 ( ClientID, SourceString ) VALUES ('A', 'Tim Bits'), ('A', 'Ken dePaul Jones'), ('A', 'Sally S Cats'), ('A', 'Mike George Mountains JR') |
다음 스크립트를 실행하면 공백을 기준으로 파싱할 문자열의 길이를 확인 할 수 있다.
SELECT ID, ClientID, SourceString, CHARINDEX(' ',SourceString,1) space_1_loc, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) space_2_loc, CASE WHEN CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1)=0 THEN 0 ELSE CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) +1) END space_3_loc FROM ##NamesFromDifferentClients_1 |
파싱할 문자열의 길이에 따라 문자를 추출하고 공백을 제거하면 성과 이름을 구분한다.
-- map extracted strings to name parts SELECT ID ,ClientID ,SourceString ,first_string FirstName ,CASE WHEN third_string = '' THEN CAST('' AS varchar(20)) ELSE CAST(second_string AS varchar(20)) END MiddleName ,CASE WHEN third_string = '' THEN CAST(second_string AS varchar(30)) ELSE CAST(third_string AS varchar(30)) END LastName -- Remove leading blank ,LTRIM(CASE WHEN fourth_string = '' THEN CAST('' AS varchar(5)) ELSE CAST(fourth_string AS varchar(5)) END) Suffix FROM ( -- extract strings SELECT ID ,ClientID ,SourceString ,LEFT(SourceString,space_1_loc-1) first_string ,CASE WHEN space_2_loc > 0 THEN CAST( SUBSTRING(SourceString, space_1_loc+1,space_2_loc-space_1_loc) AS varchar(20) ) ELSE CAST( RIGHT(SourceString, DATALENGTH(SourceString)-space_1_loc) AS varchar(20) ) END second_string ,CASE WHEN space_2_loc = 0 THEN CAST('' AS varchar(30)) WHEN space_3_loc = 0 THEN SUBSTRING(SourceString,space_2_loc+1, DATALENGTH(SourceString)) WHEN space_3_loc > 0 THEN CAST( SUBSTRING(SourceString, space_2_loc+1, space_3_loc-space_2_loc) AS varchar(30) ) END third_string ,CASE WHEN space_2_loc = 0 THEN CAST('' AS varchar(30)) WHEN space_3_loc = 0 THEN CAST('' AS varchar(30)) ELSE CAST( SUBSTRING(SourceString, space_3_loc, DATALENGTH(SourceString)) AS varchar(5)) END fourth_string FROM ( -- extract delimiter locations SELECT ID, ClientID, SourceString, CHARINDEX(' ',SourceString,1) space_1_loc, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) space_2_loc, CASE WHEN CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1)=0 THEN 0 ELSE CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) +1) END space_3_loc FROM ##NamesFromDifferentClients_1 ) SourceAndDelimiters ) SourceAndExtractedStrings |
[콤마와 공백으로 구분된 경우]
성에는 콤마와 공백으로 구분되어 있고 중간 이름과 마지막 이름은 공백으로 구분되어 있는 경우이다. 실습용 테이블을 생성하고 데이터를 입력 한다.
-- create temporary table for storing source name strings -- and their associated name parts IF OBJECT_ID('tempdb..##NamesFromDifferentClients_1') IS NOT NULL DROP TABLE ##NamesFromDifferentClients_1
CREATE TABLE ##NamesFromDifferentClients_1 ( ID BIGINT IDENTITY(1,1), ClientID varchar(20), SourceString varchar(75), FirstName varchar(20), MiddleName varchar(20), LastName varchar(30), Suffix varchar(5) )
INSERT INTO ##NamesFromDifferentClients_1 ( ClientID, SourceString ) VALUES ('B', 'Bits, Tim'), ('B', 'Jones, Ken dePaul'), ('B', 'Cats, Sally S'), ('B', 'Mountains JR, Mike George'), ('B', 'Mountains, Mary Anne Bits') |
다음 스크립트를 실행하면 콤마와 공백을 기준으로 파싱할 문자열의 길이를 확인 할 수 있다.
SELECT ID, ClientID, SourceString, CHARINDEX(',',SourceString,1) comma_loc, CHARINDEX(' ',SourceString,1) space_1_loc, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) space_2_loc, CASE WHEN CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1)=0 THEN 0 ELSE CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) +1) END space_3_loc FROM ##NamesFromDifferentClients_1 |
콤마의 위치와 공백의 위치를 참고 하여 문자열을 파싱 한다.
SELECT ID ,ClientID ,SourceString ,CASE WHEN space_3_loc = 0 THEN second_string WHEN comma_loc < space_1_loc AND space_3_loc > 0 THEN second_string + ' ' + third_string ELSE third_string END FirstName ,LTRIM(CASE WHEN space_2_loc = 0 THEN '' WHEN space_3_loc = 0 THEN third_string ELSE fourth_string END) MiddleName ,first_string LastName ,CASE WHEN comma_loc > space_1_loc THEN second_string ELSE '' END Suffix FROM ( -- extract strings SELECT ID ,ClientID ,SourceString ,comma_loc ,space_1_loc ,space_2_loc ,space_3_loc -- Remove trailing comma from either first or second string ,REPLACE(LEFT(SourceString,space_1_loc-1) ,',','') first_string ,REPLACE( CASE WHEN space_2_loc > 0 THEN CAST( SUBSTRING(SourceString, space_1_loc+1,space_2_loc-space_1_loc) AS varchar(20) ) ELSE CAST( RIGHT(SourceString, DATALENGTH(SourceString)-space_1_loc) AS varchar(20) ) END, ',','') second_string ,CASE WHEN space_2_loc = 0 THEN CAST('' AS varchar(30)) WHEN space_3_loc = 0 THEN SUBSTRING(SourceString, space_2_loc+1,DATALENGTH(SourceString)) WHEN space_3_loc > 0 THEN CAST( SUBSTRING(SourceString, space_2_loc+1, space_3_loc-space_2_loc) AS varchar(30) ) END third_string ,CASE WHEN space_2_loc = 0 THEN CAST('' AS varchar(30)) WHEN space_3_loc = 0 THEN CAST('' AS varchar(30)) ELSE CAST( SUBSTRING(SourceString, space_3_loc, DATALENGTH(SourceString)) AS varchar(20)) END fourth_string FROM ( -- extract delimiter locations SELECT ID, ClientID, SourceString, CHARINDEX(',',SourceString,1) comma_loc, CHARINDEX(' ',SourceString,1) space_1_loc, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) space_2_loc, CASE WHEN CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1)=0 THEN 0 ELSE CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) +1) END space_3_loc FROM ##NamesFromDifferentClients_1 ) SourceAndDelimiters ) SourceAndExtractedStrings |
[이름의 길이가 다른고 다양한 포맷이 혼합된 경우]
일반적인 형식과 다른 이름 형식이거나 다양한 형식의 포맷이 섞여 있는 경우이다.
-- create temporary table for storing source name strings -- and their associated name parts IF OBJECT_ID('tempdb..##NamesFromDifferentClients_1') IS NOT NULL DROP TABLE ##NamesFromDifferentClients_1
CREATE TABLE ##NamesFromDifferentClients_1 ( ID BIGINT IDENTITY(1,1), ClientID varchar(20), SourceString varchar(75), FirstName varchar(20), MiddleName varchar(20), LastName varchar(30), Suffix varchar(5) )
INSERT INTO ##NamesFromDifferentClients_1 ( ClientID, SourceString ) VALUES ('B', 'Bits, Tim'), ('B', 'Jones, Ken dePaul'), ('B', 'Cats, Sally S'), ('B', 'Mountains JR, Mike George'), ('B', 'Mountains, Mary Anne Bits')
/* String formats first_string, second_string as LastName, FirstName first_string, second_string third_string as LastName, FirstName MiddleName first_string second_string, third_string fourth_string as LastName Suffix, FirstName MiddleName first_string, second_string third_string fourth_string as LastName, FirstName MiddleName */
SELECT ID ,ClientID ,SourceString ,CASE WHEN space_3_loc = 0 THEN second_string WHEN comma_loc < space_1_loc AND space_3_loc > 0 THEN second_string + ' ' + third_string ELSE third_string END FirstName ,LTRIM(CASE WHEN space_2_loc = 0 THEN '' WHEN space_3_loc = 0 THEN third_string ELSE fourth_string END) MiddleName ,first_string LastName ,CASE WHEN comma_loc > space_1_loc THEN second_string ELSE '' END Suffix FROM ( -- extract strings SELECT ID ,ClientID ,SourceString ,comma_loc ,space_1_loc ,space_2_loc ,space_3_loc -- Remove trailing comma from either first or second string ,REPLACE(LEFT(SourceString,space_1_loc-1) ,',','') first_string ,REPLACE( CASE WHEN space_2_loc > 0 THEN CAST( SUBSTRING(SourceString, space_1_loc+1,space_2_loc-space_1_loc) AS varchar(20) ) ELSE CAST( RIGHT(SourceString, DATALENGTH(SourceString)-space_1_loc) AS varchar(20) ) END, ',','') second_string ,CASE WHEN space_2_loc = 0 THEN CAST('' AS varchar(30)) WHEN space_3_loc = 0 THEN SUBSTRING(SourceString, space_2_loc+1,DATALENGTH(SourceString)) WHEN space_3_loc > 0 THEN CAST( SUBSTRING(SourceString, space_2_loc+1, space_3_loc-space_2_loc) AS varchar(30) ) END third_string ,CASE WHEN space_2_loc = 0 THEN CAST('' AS varchar(30)) WHEN space_3_loc = 0 THEN CAST('' AS varchar(30)) ELSE CAST( SUBSTRING(SourceString, space_3_loc, DATALENGTH(SourceString)) AS varchar(20)) END fourth_string FROM ( -- extract delimiter locations SELECT ID, ClientID, SourceString, CHARINDEX(',',SourceString,1) comma_loc, CHARINDEX(' ',SourceString,1) space_1_loc, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) space_2_loc, CASE WHEN CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1)=0 THEN 0 ELSE CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString, CHARINDEX(' ',SourceString,1)+1) +1) END space_3_loc FROM ##NamesFromDifferentClients_1 ) SourceAndDelimiters ) SourceAndExtractedStrings |
[참고자료]
http://www.mssqltips.com/sqlservertip/3283/name-parsing-for-result-sets-with-different-name-formats/
강성욱 / jevida@naver.com
Microsoft SQL Server MVP
Blog : http://sqlmvp.kr
Facebook : http://facebook.com/sqlmvp