% Journal article in the Journal of the American Medical Informatics
% Association, vol. 29, no. 8 (Aug 2022), pp. 1350--1365, doi 10.1093/jamia/ocac045.
% The author list holds each contributor once, in order of first appearance in
% the exported record; the export had repeated several names verbatim.
% NOTE(review): the consortium is listed first, as in the exported record;
% confirm the intended author order against the published article.
% Acronyms and proper nouns in the title are brace-protected so that
% sentence-casing styles keep their capitalisation.
@article{897bccd3c13f45cba915f704c809ac75,
  author   = {{The N3C Consortium} and
              Thomas, Jason A. and
              Foraker, Randi E. and
              Zamstein, Noa and
              Morrow, Jon D. and
              Payne, Philip R. O. and
              Wilcox, Adam B. and
              Haendel, Melissa A. and
              Chute, Christopher G. and
              Gersing, Kenneth R. and
              Walden, Anita and
              Bennett, Tellen D. and
              Eichmann, David A. and
              Guinney, Justin and
              Kibbe, Warren A. and
              Liu, Hongfang and
              Pfaff, Emily R. and
              Robinson, Peter N. and
              Saltz, Joel H. and
              Spratt, Heidi and
              Starren, Justin and
              Suver, Christine and
              Williams, Andrew E. and
              Wu, Chunlei and
              Gabriel, Davera and
              Hong, Stephanie S. and
              Kostka, Kristin and
              Lehmann, Harold P. and
              Moffitt, Richard A. and
              Morris, Michele and
              Palchuk, Matvey B. and
              Zhang, Xiaohan Tanner and
              Zhu, Richard L. and
              Amor, Benjamin and
              Bissell, Mark M. and
              Clark, Marshall and
              Girvin, Andrew T. and
              Lee, Adam M. and
              Miller, Robert T. and
              Walters, Kellie M. and
              Chae, Yooree and
              Cook, Connor and
              Dest, Alexandra and
              Dietz, Racquel R. and
              Dillon, Thomas and
              Francis, Patricia A. and
              Fuentes, Rafael and
              Graves, Alexis and
              McMurry, Julie A. and
              Neumann, Andrew J. and
              O'Neil, Shawn T. and
              Sheikh, Usman and
              Volz, Andr{\'e}a M. and
              Zampino, Elizabeth and
              Austin, Christopher P. and
              Bozzette, Samuel and
              Deacy, Mariam and
              Garbarini, Nicole and
              Kurilla, Michael G. and
              Michael, Sam G. and
              Rutter, Joni L. and
              Temple-O'connor, Meredith and
              Bradwell, Katie Rebecca and
              Manna, Amin and
              Qureshi, Nabeel and
              Saltz, Mary Morrison and
              Bramante, Carolyn and
              Harper, Jeremy Richard and
              Hernandez, Wenndy and
              Koraishy, Farrukh M. and
              Mariona, Federico and
              Mattapally, Saidulu and
              Saha, Amit and
              Vedula, Satyanarayana and
              Fu, Yujuan and
              Mathews, Nisha and
              Mendelevitch, Ofer},
  title    = {Demonstrating an approach for evaluating synthetic geospatial and temporal epidemiologic data utility: results from analyzing {$>$}1.8 million {SARS-CoV-2} tests in the {United States} {National COVID Cohort Collaborative} ({N3C})},
  journal  = {Journal of the American Medical Informatics Association},
  year     = {2022},
  month    = aug,
  day      = {1},
  volume   = {29},
  number   = {8},
  pages    = {1350--1365},
  issn     = {1067-5027},
  doi      = {10.1093/jamia/ocac045},
  language = {English},
  keywords = {COVID-19, data sharing, data utility, electronic health records, synthetic data},
  note     = {Publisher Copyright: {\textcopyright} 2022 The Author(s). Published by Oxford University Press on behalf of the American Medical Informatics Association. All rights reserved.},
  abstract = {Objective: This study sought to evaluate whether synthetic data derived from a national coronavirus disease 2019 (COVID-19) dataset could be used for geospatial and temporal epidemic analyses. Materials and Methods: Using an original dataset (n = 1854968 severe acute respiratory syndrome coronavirus 2 tests) and its synthetic derivative, we compared key indicators of COVID-19 community spread through analysis of aggregate and zip code-level epidemic curves, patient characteristics and outcomes, distribution of tests by zip code, and indicator counts stratified by month and zip code. Similarity between the data was statistically and qualitatively evaluated. Results: In general, synthetic data closely matched original data for epidemic curves, patient characteristics, and outcomes. Synthetic data suppressed labels of zip codes with few total tests (mean = 2.9 {$\pm$} 2.4; max = 16 tests; 66\% reduction of unique zip codes). Epidemic curves and monthly indicator counts were similar between synthetic and original data in a random sample of the most tested (top 1\%; n = 171) and for all unsuppressed zip codes (n = 5819), respectively. In small sample sizes, synthetic data utility was notably decreased. Discussion: Analyses on the population-level and of densely tested zip codes (which contained most of the data) were similar between original and synthetically derived datasets. Analyses of sparsely tested populations were less similar and had more data suppression. Conclusion: In general, synthetic data were successfully used to analyze geospatial and temporal trends. Analyses using small sample sizes or populations were limited, in part due to purposeful data label suppression - an attribute disclosure countermeasure. Users should consider data fitness for use in these cases.},
}