다양한 포맷의 이름 파싱 하기

 

  • Version : SQL Server 2005, 2008, 2008R2, 2012, 2014

 

비즈니스 파트마다 다양한 형식의 이름(또는 전화번호) 세트를 사용한다. 이러한 경우 성과 이름을 어떻게 구분하여 정리 업무를 수행하는 사람이라면 한 번쯤 고민해보았을 것이다.

 

이번 시간에는 구분 할 수 있는 특정한 기호를 기준으로 이름을 파싱하여 사용하는 방법에 대해서 알아본다.

 

[공백으로 구분된 경우]

성과 이름 사이에 공백이 있는 포맷을 파싱하여 사용하는 방법이다. 실습용 테이블을 생성하고 데이터를 입력 한다.

-- create temporary table for storing source name strings

-- and their associated name parts

IF OBJECT_ID('tempdb..##NamesFromDifferentClients_1') IS NOT NULL

DROP TABLE ##NamesFromDifferentClients_1

 

CREATE TABLE ##NamesFromDifferentClients_1

(

ID BIGINT IDENTITY(1,1),

ClientID varchar(20),

SourceString varchar(75),

FirstName varchar(20),

MiddleName varchar(20),

LastName varchar(30),

Suffix varchar(5)

)

 

-- populate temporary table with source name strings

INSERT INTO ##NamesFromDifferentClients_1

(

ClientID,

SourceString

)

VALUES

('A', 'Tim Bits'),

('A', 'Ken dePaul Jones'),

('A', 'Sally S Cats'),

('A', 'Mike George Mountains JR')

 

다음 스크립트를 실행하면 공백을 기준으로 파싱할 문자열의 길이를 확인 할 수 있다.

SELECT

ID, ClientID, SourceString,

CHARINDEX(' ',SourceString,1) space_1_loc,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1) space_2_loc,

CASE

WHEN

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)=0

THEN 0

ELSE

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)

+1)

END space_3_loc

FROM ##NamesFromDifferentClients_1

 

 

파싱할 문자열의 길이에 따라 문자를 추출하고 공백을 제거하면 성과 이름을 구분한다.

-- map extracted strings to name parts

SELECT

ID

,ClientID

,SourceString

,first_string FirstName

,CASE

WHEN third_string = '' THEN CAST('' AS varchar(20))

ELSE CAST(second_string AS varchar(20))

END MiddleName

,CASE

WHEN third_string = '' THEN CAST(second_string AS varchar(30))

ELSE CAST(third_string AS varchar(30))

END LastName

-- Remove leading blank

,LTRIM(CASE

WHEN fourth_string = '' THEN CAST('' AS varchar(5))

ELSE CAST(fourth_string AS varchar(5))

END) Suffix

FROM

(

-- extract strings

SELECT

ID

,ClientID

,SourceString

,LEFT(SourceString,space_1_loc-1) first_string

,CASE

WHEN space_2_loc > 0

THEN

CAST(

SUBSTRING(SourceString,

space_1_loc+1,space_2_loc-space_1_loc)

AS varchar(20)

)

ELSE

CAST(

RIGHT(SourceString,

DATALENGTH(SourceString)-space_1_loc)

AS varchar(20)

)

END second_string

,CASE

WHEN space_2_loc = 0

THEN CAST('' AS varchar(30))

WHEN space_3_loc = 0

THEN SUBSTRING(SourceString,space_2_loc+1,

DATALENGTH(SourceString))

WHEN space_3_loc > 0

THEN

CAST(

SUBSTRING(SourceString,

space_2_loc+1,

space_3_loc-space_2_loc)

AS varchar(30)

)

END third_string

,CASE

WHEN space_2_loc = 0

THEN CAST('' AS varchar(30))

WHEN space_3_loc = 0

THEN CAST('' AS varchar(30))

ELSE

CAST(

SUBSTRING(SourceString,

space_3_loc,

DATALENGTH(SourceString))

AS varchar(5))

END fourth_string

FROM

(

-- extract delimiter locations

SELECT

ID, ClientID, SourceString,

CHARINDEX(' ',SourceString,1) space_1_loc,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1) space_2_loc,

CASE

WHEN

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)=0

THEN 0

ELSE

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)

+1)

END space_3_loc

FROM ##NamesFromDifferentClients_1

) SourceAndDelimiters

) SourceAndExtractedStrings

 

 

 

[콤마와 공백으로 구분된 경우]

성에는 콤마와 공백으로 구분되어 있고 중간 이름과 마지막 이름은 공백으로 구분되어 있는 경우이다. 실습용 테이블을 생성하고 데이터를 입력 한다.

-- create temporary table for storing source name strings

-- and their associated name parts

IF OBJECT_ID('tempdb..##NamesFromDifferentClients_1') IS NOT NULL

DROP TABLE ##NamesFromDifferentClients_1

 

CREATE TABLE ##NamesFromDifferentClients_1

(

ID BIGINT IDENTITY(1,1),

ClientID varchar(20),

SourceString varchar(75),

FirstName varchar(20),

MiddleName varchar(20),

LastName varchar(30),

Suffix varchar(5)

)

 

INSERT INTO ##NamesFromDifferentClients_1

(

ClientID,

SourceString

)

VALUES

('B', 'Bits, Tim'),

('B', 'Jones, Ken dePaul'),

('B', 'Cats, Sally S'),

('B', 'Mountains JR, Mike George'),

('B', 'Mountains, Mary Anne Bits')

 

다음 스크립트를 실행하면 콤마와 공백을 기준으로 파싱할 문자열의 길이를 확인 할 수 있다.

SELECT

ID, ClientID, SourceString,

CHARINDEX(',',SourceString,1) comma_loc,

CHARINDEX(' ',SourceString,1) space_1_loc,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1) space_2_loc,

CASE

WHEN

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)=0

THEN 0

ELSE

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)

+1)

END space_3_loc

FROM ##NamesFromDifferentClients_1

 

 

콤마의 위치와 공백의 위치를 참고 하여 문자열을 파싱 한다.

SELECT

ID

,ClientID

,SourceString

,CASE

WHEN space_3_loc = 0 THEN second_string

WHEN comma_loc < space_1_loc AND space_3_loc > 0

THEN second_string + ' ' + third_string

ELSE third_string

END FirstName

,LTRIM(CASE

WHEN space_2_loc = 0 THEN ''

WHEN space_3_loc = 0 THEN third_string

ELSE fourth_string

END) MiddleName

,first_string LastName

,CASE

WHEN comma_loc > space_1_loc THEN second_string

ELSE ''

END Suffix

FROM

(

-- extract strings

SELECT

ID

,ClientID

,SourceString

,comma_loc

,space_1_loc

,space_2_loc

,space_3_loc

-- Remove trailing comma from either first or second string

,REPLACE(LEFT(SourceString,space_1_loc-1)

,',','') first_string

,REPLACE(

CASE

WHEN space_2_loc > 0

THEN

CAST(

SUBSTRING(SourceString,

space_1_loc+1,space_2_loc-space_1_loc)

AS varchar(20)

)

ELSE

CAST(

RIGHT(SourceString,

DATALENGTH(SourceString)-space_1_loc)

AS varchar(20)

)

END,

',','') second_string

,CASE

WHEN space_2_loc = 0 THEN CAST('' AS varchar(30))

WHEN space_3_loc = 0 THEN

SUBSTRING(SourceString,

space_2_loc+1,DATALENGTH(SourceString))

WHEN space_3_loc > 0 THEN

CAST(

SUBSTRING(SourceString,

space_2_loc+1,

space_3_loc-space_2_loc)

AS varchar(30)

)

END third_string

,CASE

WHEN space_2_loc = 0 THEN CAST('' AS varchar(30))

WHEN space_3_loc = 0 THEN CAST('' AS varchar(30))

ELSE

CAST(

SUBSTRING(SourceString,

space_3_loc,

DATALENGTH(SourceString))

AS varchar(20))

END fourth_string

FROM

(

-- extract delimiter locations

SELECT

ID, ClientID, SourceString,

CHARINDEX(',',SourceString,1) comma_loc,

CHARINDEX(' ',SourceString,1) space_1_loc,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1) space_2_loc,

CASE

WHEN

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)=0

THEN 0

ELSE

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)

+1)

END space_3_loc

FROM ##NamesFromDifferentClients_1

) SourceAndDelimiters

) SourceAndExtractedStrings

 

 

 

[이름의 길이가 다른고 다양한 포맷이 혼합된 경우]

일반적인 형식과 다른 이름 형식이거나 다양한 형식의 포맷이 섞여 있는 경우이다.

-- create temporary table for storing source name strings

-- and their associated name parts

IF OBJECT_ID('tempdb..##NamesFromDifferentClients_1') IS NOT NULL

DROP TABLE ##NamesFromDifferentClients_1

 

CREATE TABLE ##NamesFromDifferentClients_1

(

ID BIGINT IDENTITY(1,1),

ClientID varchar(20),

SourceString varchar(75),

FirstName varchar(20),

MiddleName varchar(20),

LastName varchar(30),

Suffix varchar(5)

)

 

INSERT INTO ##NamesFromDifferentClients_1

(

ClientID,

SourceString

)

VALUES

('B', 'Bits, Tim'),

('B', 'Jones, Ken dePaul'),

('B', 'Cats, Sally S'),

('B', 'Mountains JR, Mike George'),

('B', 'Mountains, Mary Anne Bits')

 

/*

String formats

first_string, second_string

as LastName, FirstName

first_string, second_string third_string

as LastName, FirstName MiddleName

first_string second_string, third_string fourth_string

as LastName Suffix, FirstName MiddleName

first_string, second_string third_string fourth_string

as LastName, FirstName MiddleName

*/

 

SELECT

ID

,ClientID

,SourceString

,CASE

WHEN space_3_loc = 0 THEN second_string

WHEN comma_loc < space_1_loc AND space_3_loc > 0

THEN second_string + ' ' + third_string

ELSE third_string

END FirstName

,LTRIM(CASE

WHEN space_2_loc = 0 THEN ''

WHEN space_3_loc = 0 THEN third_string

ELSE fourth_string

END) MiddleName

,first_string LastName

,CASE

WHEN comma_loc > space_1_loc THEN second_string

ELSE ''

END Suffix

FROM

(

-- extract strings

SELECT

ID

,ClientID

,SourceString

,comma_loc

,space_1_loc

,space_2_loc

,space_3_loc

-- Remove trailing comma from either first or second string

,REPLACE(LEFT(SourceString,space_1_loc-1)

,',','') first_string

,REPLACE(

CASE

WHEN space_2_loc > 0

THEN

CAST(

SUBSTRING(SourceString,

space_1_loc+1,space_2_loc-space_1_loc)

AS varchar(20)

)

ELSE

CAST(

RIGHT(SourceString,

DATALENGTH(SourceString)-space_1_loc)

AS varchar(20)

)

END,

',','') second_string

,CASE

WHEN space_2_loc = 0 THEN CAST('' AS varchar(30))

WHEN space_3_loc = 0 THEN

SUBSTRING(SourceString,

space_2_loc+1,DATALENGTH(SourceString))

WHEN space_3_loc > 0 THEN

CAST(

SUBSTRING(SourceString,

space_2_loc+1,

space_3_loc-space_2_loc)

AS varchar(30)

)

END third_string

,CASE

WHEN space_2_loc = 0 THEN CAST('' AS varchar(30))

WHEN space_3_loc = 0 THEN CAST('' AS varchar(30))

ELSE

CAST(

SUBSTRING(SourceString,

space_3_loc,

DATALENGTH(SourceString))

AS varchar(20))

END fourth_string

FROM

(

-- extract delimiter locations

SELECT

ID, ClientID, SourceString,

CHARINDEX(',',SourceString,1) comma_loc,

CHARINDEX(' ',SourceString,1) space_1_loc,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1) space_2_loc,

CASE

WHEN

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)=0

THEN 0

ELSE

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,

CHARINDEX(' ',SourceString,1)+1)

+1)

END space_3_loc

FROM ##NamesFromDifferentClients_1

) SourceAndDelimiters

) SourceAndExtractedStrings

 

 

 

 

 

[참고자료]

http://www.mssqltips.com/sqlservertip/3283/name-parsing-for-result-sets-with-different-name-formats/

 



강성욱 / jevida@naver.com
Microsoft SQL Server MVP
Blog : http://sqlmvp.kr
Facebook : http://facebook.com/sqlmvp





profile

강성욱 / jevida@naver.com
Microsoft SQL Server MVP
Blog : http://sqlmvp.kr
Facebook : http://facebook.com/sqlmvp

Kakao Talk : SQLMVP

Line : jevida


현재 LA에 거주하고 있으며 SQL에 관심있는 분이면 언제든 친추 환영합니다.