-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcms_kumc_mapping_dedup.sql
183 lines (164 loc) · 5.96 KB
/
cms_kumc_mapping_dedup.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
-- cms_kumc_mapping_dedup.sql: explore the duplicates in the data and remove them from consideration
-- Copyright (c) 2017 University of Kansas Medical Center
-- Run against identified GROUSE server.
-- 1. How many bene_id maps to multiple patient_nums
select count(patient_num) ct, bene_id from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS
from cms_id.cms_kumc_mapping
where patient_num is not null and bene_id is not null)
group by bene_id
having count(patient_num)>1;
-- 2. How many patient_num maps to multiple bene_ids
select count(bene_id) ct, patient_num from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS
from cms_id.cms_kumc_mapping
where patient_num is not null and bene_id is not null)
group by patient_num
having count(bene_id)>1;
-- 3. Duplicates caused by bene_ids in crosswalk that are missing in cms_id.bene_id_mapping
select count(*) from cms_id.cms_kumc_mapping
where bene_id is null and xw_bene_id is not null;
select count(*) ct, patient_num from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0 and
dups_missing_map = 0)
group by patient_num
having count(*) > 1;
-- De-duplication
-- 1. dups_pat_num will be set to the number of patient_nums the bene_id maps to
-- if it maps to more than one
MERGE
INTO cms_id.cms_kumc_mapping mp
USING (
select count(patient_num) ct, bene_id from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS
from cms_id.cms_kumc_mapping
where patient_num is not null and bene_id is not null)
group by bene_id
having count(patient_num)>1
) dup
ON (dup.bene_id = mp.bene_id)
WHEN MATCHED THEN UPDATE
SET mp.dups_pat_num = dup.ct;
select count(*) from cms_id.cms_kumc_mapping
where dups_pat_num<>0;
-- 2. dups_bene_id will be set to the number of bene_ids the patient_num maps to
-- if it maps more than one
MERGE
INTO cms_id.cms_kumc_mapping mp
USING (
select count(bene_id) ct, patient_num from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS
from cms_id.cms_kumc_mapping
where patient_num is not null and bene_id is not null)
group by patient_num
having count(bene_id)>1
) dup
ON (dup.patient_num = mp.patient_num)
WHEN MATCHED THEN UPDATE
SET mp.dups_bene_id = dup.ct;
-- 3. dups_missing_map will be set to the number of rows a patient_num maps to
-- when the issue is because bene_ids in crosswalk are missing in cms_id.bene_id_mapping
MERGE
INTO cms_id.cms_kumc_mapping mp
USING (
select count(*) ct, patient_num from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0)
group by patient_num
having count(*) > 1
) dup
ON (dup.patient_num = mp.patient_num)
WHEN MATCHED THEN UPDATE
SET mp.DUPS_MISSING_MAP = dup.ct;
-- ======== FINAL PATIENT MAPPING OF INTEREST
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, cms_dob_shift_months,
bh_date_shift_days, bh_dob_date_shift
from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0
and (dups_missing_map = 0 or (bene_id is not null and xw_bene_id is not null));
-- ========= PATIENTS WHO ARE IN BOTH THE DATA SETS
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0
and (dups_missing_map = 0 or (bene_id is not null and xw_bene_id is not null))
and patient_num is not null and bene_id_deid is not null;
-- ========== NOW VERIFICATION OF THIS SET
-- Verifying some counts
select count(distinct patient_num) from cms_id.cms_kumc_mapping;
select count(distinct bene_id) from cms_id.cms_kumc_mapping;
select count(*) from cms_id.bene_id_mapping;
select count(distinct bene_id) from cms_id.bene_id_mapping;
-- Verification of de-duplication
select count(bene_id_deid), patient_num from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, BH_DATE_SHIFT_DAYS,
cms_dob_shift_months, BH_DOB_DATE_SHIFT
from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0 and
((dups_missing_map =0) or (bene_id is not null and xw_bene_id is not null))
) group by patient_num
having count(bene_id_deid)>1;
select count(distinct patient_num)
from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, cms_dob_shift_months,
bh_date_shift_days, bh_dob_date_shift
from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0
and (dups_missing_map = 0 or (bene_id is not null and xw_bene_id is not null))
);
-- This count should match the one below
select count(patient_num)
from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, cms_dob_shift_months,
bh_date_shift_days, bh_dob_date_shift
from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0
and (dups_missing_map = 0 or (bene_id is not null and xw_bene_id is not null))
);
select count(distinct BENE_ID_DEID)
from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, cms_dob_shift_months,
bh_date_shift_days, bh_dob_date_shift
from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0
and (dups_missing_map = 0 or (bene_id is not null and xw_bene_id is not null))
);
-- This count should match the one below
select count(BENE_ID_DEID)
from (
select distinct patient_num, bene_id, bene_id_deid,
cms_date_shift_days, cms_dob_shift_months,
bh_date_shift_days, bh_dob_date_shift
from cms_id.cms_kumc_mapping
where
dups_bene_id = 0 and
dups_pat_num = 0
and (dups_missing_map = 0 or (bene_id is not null and xw_bene_id is not null))
);