<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.0 20120330//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article"><?properties manuscript?><front><journal-meta><journal-id journal-id-type="nlm-journal-id">100970413</journal-id><journal-id journal-id-type="pubmed-jr-id">22289</journal-id><journal-id journal-id-type="nlm-ta">J Biomed Inform</journal-id><journal-id journal-id-type="iso-abbrev">J Biomed Inform</journal-id><journal-title-group><journal-title>Journal of biomedical informatics</journal-title></journal-title-group><issn pub-type="ppub">1532-0464</issn><issn pub-type="epub">1532-0480</issn></journal-meta><article-meta><article-id pub-id-type="pmid">25533437</article-id><article-id pub-id-type="pmc">4355087</article-id><article-id pub-id-type="doi">10.1016/j.jbi.2014.12.005</article-id><article-id pub-id-type="manuscript">NIHMS651282</article-id><article-categories><subj-group subj-group-type="heading"><subject>Article</subject></subj-group></article-categories><title-group><article-title>Sparse modeling of spatial environmental variables associated with asthma</article-title></title-group><contrib-group><contrib contrib-type="author"><name><surname>Chang</surname><given-names>Timothy S.</given-names></name><xref ref-type="aff" rid="A1">1</xref><email>tschang3@uwalumni.com</email></contrib><contrib contrib-type="author"><name><surname>Gangnon</surname><given-names>Ronald E.</given-names></name><xref ref-type="aff" rid="A2">2</xref><email>ronald@biostat.wisc.edu</email></contrib><contrib contrib-type="author"><name><surname>Page</surname><given-names>C. 
David</given-names></name><xref ref-type="aff" rid="A3">3</xref><email>page@biostat.wisc.edu</email></contrib><contrib contrib-type="author"><name><surname>Buckingham</surname><given-names>William R.</given-names></name><xref ref-type="aff" rid="A4">4</xref><email>wrbuckin@wisc.edu</email></contrib><contrib contrib-type="author"><name><surname>Tandias</surname><given-names>Aman</given-names></name><xref ref-type="aff" rid="A5">5</xref><xref rid="FN2" ref-type="author-notes">I</xref></contrib><contrib contrib-type="author"><name><surname>Cowan</surname><given-names>Kelly J.</given-names></name><xref ref-type="aff" rid="A6">6</xref><xref rid="FN3" ref-type="author-notes">II</xref></contrib><contrib contrib-type="author"><name><surname>Tomasallo</surname><given-names>Carrie D.</given-names></name><xref ref-type="aff" rid="A7">7</xref><email>carrie.tomasallo@dhs.wisconsin.gov</email></contrib><contrib contrib-type="author"><name><surname>Arndt</surname><given-names>Brian G.</given-names></name><xref ref-type="aff" rid="A8">8</xref><email>brian.arndt@fammed.wisc.edu</email></contrib><contrib contrib-type="author"><name><surname>Hanrahan</surname><given-names>Lawrence P.</given-names></name><xref ref-type="aff" rid="A9">9</xref><email>larry.hanrahan@fammed.wisc.edu</email></contrib><contrib contrib-type="author"><name><surname>Guilbert</surname><given-names>Theresa W.</given-names></name><xref ref-type="aff" rid="A10">10</xref><xref rid="FN4" ref-type="author-notes">III</xref></contrib></contrib-group><aff id="A1"><label>1</label>Department of Biostatistics and Medical Informatics, School of Medicine and Public Health, University of Wisconsin, 5795 Medical Sciences Center, 1300 University Ave, Madison, Wisconsin 53706, USA</aff><aff id="A2"><label>2</label>Department of Biostatistics and Medical Informatics, School of Medicine and Public Health, University of Wisconsin, 603 Warf Office Building, 610 Walnut St, Madison, Wisconsin 53706, USA</aff><aff 
id="A3"><label>3</label>Department of Biostatistics and Medical Informatics, School of Medicine and Public Health, University of Wisconsin, 6743 Medical Sciences Center, 1300 University Ave, Madison, Wisconsin 53706, USA</aff><aff id="A4"><label>4</label>Applied Population Laboratory, Department of Rural Sociology, University of Wisconsin, 308b Agricultural Hall, 1450 Linden Dr, Madison, Wisconsin 53706, USA</aff><aff id="A5"><label>5</label>Department of Family Medicine, School of Medicine and Public Health, University of Wisconsin, 1100 Delaplaine Ct, Madison, Wisconsin 53715, USA</aff><aff id="A6"><label>6</label>Department of Pediatrics, School of Medicine and Public Health, University of Wisconsin, Madison, Wisconsin, USA</aff><aff id="A7"><label>7</label>Division of Public Health, Bureau of Environmental and Occupational Health, Wisconsin Department of Health Services, Room 150, 1 West Wilson Street, Madison, Wisconsin 53703, USA</aff><aff id="A8"><label>8</label>Department of Family Medicine, School of Medicine and Public Health, University of Wisconsin, 1100 Delaplaine Ct, Madison, Wisconsin 53715, USA</aff><aff id="A9"><label>9</label>Department of Population Health Sciences, School of Medicine and Public Health, University of Wisconsin, 1100 Delaplaine Ct, Madison, Wisconsin 53715, USA</aff><aff id="A10"><label>10</label>Department of Pediatrics, School of Medicine and Public Health, University of Wisconsin-Madison, Madison, Wisconsin, USA</aff><author-notes><corresp id="FN1">Corresponding Author: Timothy S. 
Chang, MD, PhD, Department of Biostatistics and Medical Informatics, 5795 Medical Sciences Center, 1300 University Ave, Madison, WI 53706, Phone: +1 608 265 5693, Fax: +1 608 265 7916, <email>tschang3@uwalumni.com</email></corresp><fn id="FN2" fn-type="present-address"><label>I</label><p>Present: Division of Public Health, Bureau of Environmental and Occupational Health, Wisconsin Department of Health Services, Room 150, 1 West Wilson Street, Madison, Wisconsin 53703, USA, <email>aman.tandias@dhs.wisconsin.gov</email></p></fn><fn id="FN3" fn-type="present-address"><label>II</label><p>Present: Department of Pediatrics, College of Medicine, University of Vermont, 111 Colchester Avenue, Burlington, Vermont 05401, USA, <email>kelly.cowan@vtmednet.org</email></p></fn><fn id="FN4" fn-type="present-address"><label>III</label><p>Present: Department of Pediatrics, Cincinnati Children&#x02019;s Hospital Medical Center, MLC 2021, 3333 Burnet Avenue, Cincinnati, Ohio 45229, USA, <email>theresa.guilbert@cchmc.org</email></p></fn></author-notes><pub-date pub-type="nihms-submitted"><day>28</day><month>12</month><year>2014</year></pub-date><pub-date pub-type="epub"><day>20</day><month>12</month><year>2014</year></pub-date><pub-date pub-type="ppub"><month>2</month><year>2015</year></pub-date><pub-date pub-type="pmc-release"><day>01</day><month>2</month><year>2016</year></pub-date><volume>53</volume><fpage>320</fpage><lpage>329</lpage><!--elocation-id from pubmed: 10.1016/j.jbi.2014.12.005--><permissions><copyright-statement>&#x000a9; 2014 Elsevier Inc. All rights reserved.</copyright-statement><copyright-year>2014</copyright-year></permissions><abstract><p id="P2">Geographically distributed environmental factors influence the burden of diseases such as asthma. Our objective was to identify sparse environmental variables associated with asthma diagnosis gathered from a large electronic health record (EHR) dataset while controlling for spatial variation.
An EHR dataset from the University of Wisconsin&#x02019;s Family Medicine, Internal Medicine and Pediatrics Departments was obtained for 199,220 patients aged 5&#x02013;50 years over a three-year period. Each patient&#x02019;s home address was geocoded to one of 3,456 geographic census block groups. Over one thousand block group variables were obtained from a commercial database. We developed a Sparse Spatial Environmental Analysis (SASEA). Using this method, the environmental variables were first dimensionally reduced with sparse principal component analysis. Logistic thin plate regression spline modeling was then used to identify block group variables associated with asthma from sparse principal components. The addresses of patients from the EHR dataset were distributed throughout the majority of Wisconsin&#x02019;s geography. Logistic thin plate regression spline modeling captured spatial variation of asthma. Four sparse principal components identified via model selection consisted of food at home, dog ownership, household size, and disposable income variables. In rural areas, dog ownership and renter occupied housing units from significant sparse principal components were associated with asthma. Our main contribution is the incorporation of sparsity in spatial modeling. SASEA sequentially added sparse principal components to Logistic thin plate regression spline modeling. This method allowed association of geographically distributed environmental factors with asthma using EHR and environmental datasets. 
SASEA can be applied to other diseases with environmental risk factors.</p></abstract><kwd-group><kwd>asthma</kwd><kwd>sparsity</kwd><kwd>spatial statistics</kwd><kwd>environmental variables</kwd><kwd>electronic health record</kwd></kwd-group></article-meta></front><body><sec sec-type="intro" id="S1"><title>INTRODUCTION</title><p id="P3">While there is continued interest in associating genes with disease using methods such as genome-wide association studies [<xref rid="R1" ref-type="bibr">1</xref>], approximately 23% of disease burden and death can be attributed to environmental factors [<xref rid="R2" ref-type="bibr">2</xref>]. It is important to associate diseases with a strong environmental component, including respiratory infections, cardiovascular disease, cerebrovascular disease, and asthma [<xref rid="R2" ref-type="bibr">2</xref>], with geographical environmental factors. Methods that consider spatial variation and interpretability of results will increasingly be utilized as clinical, environmental, and geographical datasets become more readily available. Our paper applies sparsity with spatial modeling to study the association of environmental factors and asthma.</p><sec id="S2"><title>1.1 Asthma risk factors</title><p id="P4">Asthma is a chronic respiratory disease with variable and recurring symptoms, airflow obstruction, bronchial hyperresponsiveness, and inflammation [<xref rid="R3" ref-type="bibr">3</xref>]. Its prevalence rose by 15% in the last 10 years [<xref rid="R4" ref-type="bibr">4</xref>]. Based on a Wisconsin Department of Health Services asthma surveillance report, approximately 14% of adults and 10% of children have been diagnosed with asthma in Wisconsin [<xref rid="R5" ref-type="bibr">5</xref>]. In 2009, 5,300 people were hospitalized and 21,000 went to an emergency department with a principal diagnosis of asthma. 
Eleven percent of adults with asthma had an emergency department visit and 20% had urgent care visits for symptoms [<xref rid="R5" ref-type="bibr">5</xref>].</p><p id="P5">Asthma onset is associated with multiple, complex factors. While some are non-modifiable such as sex and age [<xref rid="R6" ref-type="bibr">6</xref>], many others are associated with the environment and residential location. These include educational attainment, household income, health insurance, smoking, physical activity, and obesity [<xref rid="R6" ref-type="bibr">6</xref>]. Medical conditions influenced by the environment and associated with asthma include atopy [<xref rid="R7" ref-type="bibr">7</xref>], allergic reactions [<xref rid="R8" ref-type="bibr">8</xref>], airway hyperreactivity [<xref rid="R9" ref-type="bibr">9</xref>], and airway responsiveness [<xref rid="R10" ref-type="bibr">10</xref>]. Over 370 outdoor and indoor environmental factors have been associated with asthma including substances from building materials, cleaning products, personal care products, central heating systems, maintenance, and humidification devices [<xref rid="R11" ref-type="bibr">11</xref>].</p></sec><sec id="S3" sec-type="methods"><title>1.2 Geographical analysis of asthma</title><p id="P6">Geographic information system (GIS) analyses have been used to study geographic environmental variables associated with asthma. The most studied variable was air pollution [<xref rid="R12" ref-type="bibr">12</xref>], which has been measured via passive measurement, direct measurement, proximity to roadways, and traffic carbon emissions. Besides air pollution, asthma was associated with climate differences [<xref rid="R13" ref-type="bibr">13</xref>], latitude [<xref rid="R14" ref-type="bibr">14</xref>], and socioeconomic status [<xref rid="R15" ref-type="bibr">15</xref>]. 
Socioeconomic status, specifically male employment, was positively associated with asthma in a Southern California study, where access to care and the hygiene hypothesis&#x02014;the idea that limited exposure to bacterial and viral pathogens during childhood results in a predisposition to allergy [<xref rid="R16" ref-type="bibr">16</xref>,<xref rid="R17" ref-type="bibr">17</xref>]&#x02014;were proposed as explanations.</p><p id="P7">Fewer asthma studies have incorporated local environmental variables aggregated at the level of census tracts or block groups. Census tracts and block groups are geographic areas developed by the United States Census Bureau and contain 1,500&#x02013;8,000 and 600&#x02013;3,000 people, respectively. Using census tract data, asthma diagnosis was correlated with houses facing highway intersection [<xref rid="R18" ref-type="bibr">18</xref>] and sociodemographic characteristics of race, sex, and education [<xref rid="R19" ref-type="bibr">19</xref>]. Fewer studies have used block group level variables. Socioeconomic status was associated with asthma diagnosis using block group level data [<xref rid="R15" ref-type="bibr">15</xref>]. Many of these analyses used questionnaire data to determine asthma diagnosis, which may be limited by self-report bias [<xref rid="R20" ref-type="bibr">20</xref>]. These analyses involved fewer than 5,700 participants, 10 environmental variables, and census geographic regions from only a portion of a state.</p></sec><sec id="S4" sec-type="methods"><title>1.3 Environmental variables associated with EHR data</title><p id="P8">Environmental variables and built environments have been studied using EHR data. For example, nitrogen oxides were tested for association with diseases including asthma diagnoses obtained from EHR datasets in primary care [<xref rid="R21" ref-type="bibr">21</xref>].
Body mass index (BMI) calculated from EHR data was positively associated with the number of fast food restaurants near a person&#x02019;s home [<xref rid="R22" ref-type="bibr">22</xref>].</p><p id="P9">Schwartz et al. [<xref rid="R23" ref-type="bibr">23</xref>] used an EHR dataset, environmental community-level variables, and multilevel statistical analysis to demonstrate that lower BMI was associated with higher socioeconomic status and areas with more venues for physical activity.</p></sec><sec id="S5" sec-type="methods"><title>1.4 Spatial Statistics to Study Disease</title><p id="P10">Spatial statistics offer methods to incorporate geographic location to identify risk factors associated with disease [<xref rid="R24" ref-type="bibr">24</xref>]. The spatial statistics utilized in this study included a generalized additive model. Generalized additive models [<xref rid="R25" ref-type="bibr">25</xref>] are generalized linear models with predictors that involve a linear sum of smooth functions.</p><p id="P11">Previous health studies that utilized spatial generalized additive models investigated the association of air pollution and mortality, tuberculosis drug resistance patterns in Peru [<xref rid="R26" ref-type="bibr">26</xref>], and geographic distribution of heart disease [<xref rid="R27" ref-type="bibr">27</xref>].</p><p id="P12">Spatial statistics, specifically additive models, have been combined with sparsity. COSSO [<xref rid="R28" ref-type="bibr">28</xref>] and SpAM [<xref rid="R29" ref-type="bibr">29</xref>] extended the lasso estimator [<xref rid="R30" ref-type="bibr">30</xref>] while another approach created a new sparsity-smoothness penalty [<xref rid="R31" ref-type="bibr">31</xref>].</p></sec><sec id="S6"><title>1.5 Objective</title><p id="P13">Our goal was to identify an interpretable set of environmental risk factors of asthma distributed geographically.
Other studies have combined environmental variables and EHR data, spatial statistics and disease, and spatial statistics and sparsity. Our main contribution is the addition of sparsity to spatial statistics. As applied to geographically distributed EHR and environmental datasets, we describe this methodology as <underline>S</underline>p<underline>a</underline>rse <underline>S</underline>patial <underline>E</underline>nvironmental <underline>A</underline>nalysis (SASEA).</p></sec></sec><sec sec-type="materials|methods" id="S7"><title>MATERIAL AND METHODS</title><sec id="S8"><title>2.1 Source of Clinical Data</title><p id="P14">Our research group developed the <underline>U</underline>niversity of <underline>W</underline>isconsin <underline>E</underline>lectronic <underline>Health</underline> Record - <underline>P</underline>ublic <underline>H</underline>ealth <underline>In</underline>formation <underline>Ex</underline>change (UW eHealth-PHINEX), an EHR data exchange between University of Wisconsin (UW) Departments of Family Medicine, Internal Medicine, and Pediatrics and the Wisconsin Division of Public Health. Further details have been described previously [<xref rid="R32" ref-type="bibr">32</xref>]. Briefly, the database contains clinical care variables such as disease diagnoses, medications, and laboratory test results. Patient home addresses from year 2012 were geocoded to year 2000 block groups, the smallest geographic area the US Census Bureau publishes. Block groups were linked to detailed demographic and environmental data from the ESRI Business Analyst database [<xref rid="R33" ref-type="bibr">33</xref>]. The data exchange is a HIPAA Privacy Rule-compliant limited dataset, and the Wisconsin Division of Public Health is blinded to patient- and provider-specific information. All patient identifiers were removed from the data except birth month and year, ZIP code, and census block group of the patient&#x02019;s address.
Random accession numbers were used for patients, primary care providers, and clinics. This study was approved by the UW Institutional Review Board protocol M2009-1273 and UW Health with data use agreements.</p><p id="P15">UW Departments of Family Medicine, Internal Medicine and Pediatrics provide care in 42 clinics throughout Wisconsin, but most are located in southcentral Wisconsin. Patients represent various environmental and socioeconomic strata in rural and urban regions.</p><p id="P16">The dataset study period was from 2007 to 2009. Patients were identified as asthma cases when an asthma ICD-9 code of 493.xx was associated with Current Procedural Terminology (CPT) codes for hospital discharges (CPT codes 99238 and 99239) or office visits (CPT codes 99201&#x02013;99205 and 99211&#x02013;99215). Patients were identified as controls if they did not have a hospital discharge or office visit associated with an asthma ICD-9 code over the study period, but were seen at least once in the UW Departments of Family Medicine, Internal Medicine, or Pediatrics. Participants in the study were restricted to be 5 to 50 years of age. There were no additional exclusion criteria.</p><p id="P17">This study included 199,220 participants [<xref rid="R32" ref-type="bibr">32</xref>]. There were 103,690 patients living in 2,186 block groups with sufficient data also linked with ESRI data to perform the analysis described in section 2.3.</p><p id="P18">The ESRI Business Analyst environmental database [<xref rid="R33" ref-type="bibr">33</xref>] consisted of 1,117 variables, which included demographics (age, income, education), living conditions (household members, rental property, pets, rural living), behaviors (food consumption, transportation, smoking, television), health (drug prescriptions), and businesses (types of employees and employers).
Most variables (992 of the 1,117) represented data from year 2010 while the remaining variables represented data from the year 2000 (please see <xref rid="SD1" ref-type="supplementary-material">Appendix Table 1</xref>). Variables were normalized to the number of participants or number of households when appropriate and standardized to 
<inline-graphic xlink:href="nihms651282ig1.jpg"/>(0,1).</p></sec><sec id="S9"><title>2.2 Spatial Variation of Asthma</title><p id="P19">The large-scale spatial variation of asthma was estimated using a Logistic generalized additive model with a thin plate regression spline smoothing term [<xref rid="R34" ref-type="bibr">34</xref>], which we refer to as a Logistic thin plate regression spline model. As described in the Introduction, generalized additive models [<xref rid="R25" ref-type="bibr">25</xref>] are generalized linear models with predictors that involve a linear sum of smooth functions. Smooth functions allow a more flexible model specification that can account for the spatial location of variables. A thin plate regression spline is considered an optimal smooth function as it was developed for optimal smoothness and data fitting using a more computationally feasible low rank approximation [<xref rid="R34" ref-type="bibr">34</xref>]. Thin plate regression splines do not require user-specified locations of knots and are multivariate, penalized low rank approximations of a smooth function with optimal data fitting and smoothness [<xref rid="R34" ref-type="bibr">34</xref>]. Tensor product smooths were not used as both longitude and latitude were scaled similarly. The geographic area of Wisconsin was small and did not necessitate pseudosplines on a sphere [<xref rid="R35" ref-type="bibr">35</xref>]. The thin plate regression spline was represented by a bivariate smooth term with the longitude and latitude of the block group centroid.</p><p id="P20">ArcGIS software [<xref rid="R36" ref-type="bibr">36</xref>] was used to map the total number of patients, prevalence, and Logistic thin plate regression spline modeling predicted prevalence per block group. 
Block groups with &#x02264; 20 total participants (asthmatic and non-asthmatic) were mapped with a different coloring scheme than block groups with &#x0003e; 20 total participants.</p></sec><sec id="S10"><title>2.3 Association of environmental variables with asthma</title><p id="P21">The Logistic thin plate regression spline model with covariates was: 
<disp-formula id="FD1"><label>(1)</label><mml:math id="M1" display="block" overflow="scroll"><mml:mo>log</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi mathvariant="italic">asthma</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b1;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:msub><mml:mi mathvariant="italic">block</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">age</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">sex</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">race</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">ethnicity</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>5</mml:mn></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">BMI</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>6</mml:mn></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">encounter</mml:mi><mml:mspace width="0.16667em"/><mml:mi mathvariant="italic">days</mml:mi><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>7</mml:mn></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">distance</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:math></disp-formula> where <italic>i</italic> is a participant and <italic>j</italic> is the block group participant <italic>i</italic>&#x02019;s home address is geocoded to. The thin plate regression spline is 
<inline-formula><mml:math id="M2" overflow="scroll"><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:msubsup><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>q</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>c</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:msub><mml:mi mathvariant="bold-italic">&#x003b6;</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> where <italic>c<sub>k</sub></italic>(<italic>x<sub>i</sub></italic>, <italic>y<sub>i</sub></italic>) is the <italic>k</italic><sup>th</sup> basis function, <bold><italic>&#x003b6;</italic></bold><italic><sub>k</sub></italic> is an unknown parameter, and <italic>x<sub>i</sub></italic> and <italic>y<sub>i</sub></italic> are the latitude and longitude for the centroid of the block group participant&#x02019;s geocoded home address. <italic>&#x003b1;<sub>j</sub>block<sub>j</sub></italic> is the block group random effect allowing for hierarchical structuring of the model. The basis dimension, q, was chosen to be 80, which was twice the estimated degrees of freedom to allow for appropriate smoothness. BMI was the body mass index at first encounter. The encounter days covariate was defined as the number of days between a patient&#x02019;s first and last encounter in the EHR dataset.
Encounter days controlled for the differences between patients who utilized the University of Wisconsin&#x02019;s hospitals and clinics over a short amount of time (e.g., those who had one visit to the emergency department) versus patients who utilized the hospitals and clinics over a longer amount of time (e.g., those who received the majority of their medical care at the University of Wisconsin). The distance covariate was defined as the Euclidean distance between a patient&#x02019;s home address and the address of the primary care office with the most frequent visits.</p><p id="P54">An adapted Logistic generalized additive model fitting with subsampling for smoothing spline fitting was used to accommodate the large dataset [<xref rid="R37" ref-type="bibr">37</xref>,<xref rid="R38" ref-type="bibr">38</xref>]. Subsampling was a technique used for faster computation and did not cause parameter estimate variability. The smoothing splines were first set using a subsample of the data. In each subsequent step of the penalized iteratively re-weighted least squares (PIRLS) algorithm, the weighted model matrix was constructed in blocks with the corresponding QR decomposition so as not to form the entire model matrix. This method is justified for restricted maximum likelihood estimation because of asymptotic multivariate normality of Q&#x02019;z, where z is the pseudodata. This adapted method was previously implemented in the R package <italic>mgcv</italic> using the <italic>bam</italic> function with <italic>tp</italic> parameter [<xref rid="R34" ref-type="bibr">34</xref>].</p><p id="P55">The 1,117 environmental variables from ESRI were dimensionally reduced using sparse principal component analysis (SPCA) [<xref rid="R39" ref-type="bibr">39</xref>] before testing for association with asthma. SPCA is in contrast to principal component analysis (PCA). In PCA, the principal components are a linear combination of the original variables. 
SPCA uses only a small number of non-zero weighted original variables to create each principal component. By having a small number of the original variables constitute each principal component, we can more easily discuss groupings of variables. The simplest SPCA implementation first identifies principal components with traditional PCA. Each principal component can then be regressed using the original variables with a lasso penalty. We chose twenty as the number of non-zero variables to be included for each sparse principal component for ease of interpretability. The SPCA algorithm determined which environmental variables were chosen. We utilized the <italic>spca</italic> function in the <italic>elasticnet</italic> package from R [<xref rid="R39" ref-type="bibr">39</xref>].</p><p id="P56">The sparse principal components were used to determine how environmental variables were associated with asthma. Starting with the first sparse principal component, which represented the greatest variance of the ESRI dataset, sparse principal components were added sequentially to the Logistic thin plate regression spline model with covariates as shown below.</p><p id="P22">
<disp-formula id="FD2"><label>(2)</label><mml:math id="M3" display="block" overflow="scroll"><mml:mo>log</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi mathvariant="italic">asthma</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mi>f</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b1;</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:msub><mml:mi mathvariant="italic">block</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">age</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">sex</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>3</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">race</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:msub><mml:mi mathvariant="italic">ethnicity</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>5</mml:mn></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">BMI</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>6</mml:mn></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">encounter</mml:mi><mml:mspace width="0.16667em"/><mml:mi mathvariant="italic">days</mml:mi><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003b2;</mml:mi><mml:mn>7</mml:mn></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">distance</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>i</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msubsup><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>r</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>&#x003b4;</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">SPC</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></disp-formula> where <italic>r</italic> = {1, &#x02026;, 18} and (<italic>SPC</italic>)<italic><sub>j</sub></italic><sub>,</sub><italic><sub>m</sub></italic> is the value of sparse principal component <italic>m</italic> at block group <italic>j</italic>. The largest model tested included the thin plate regression spline, covariates, and sparse principal components one through eighteen. Bayesian Information Criterion (BIC) was used to compare models without sparse principal components and with <italic>r</italic> = {1, &#x02026;, 18}. Eighteen was chosen as the maximum number of sparse principal components we would be willing to investigate, as interpretability of environmental variables was a major goal. As models with increasing parameters can have a greater likelihood, BIC is a score used in model selection that penalizes the likelihood by the number of parameters. <italic>BIC</italic> = &#x02212;2 * ln(<italic>L</italic>) + <italic>k</italic> * ln(<italic>N</italic>), where <italic>L</italic> is the likelihood, <italic>k</italic> is the number of parameters estimated and <italic>N</italic> is the number of observations [<xref rid="R40" ref-type="bibr">40</xref>]. 
The model with the lowest BIC is optimal.</p><p id="P23">We summarize the number of variables used in modeling. There are 1,117 environmental variables. Using sparse principal components analysis (SPCA), 20 environmental variables were selected to represent each sparse principal component (SPC). By using SPCA, SPCs were ranked by importance based on the variance each SPC represented from the original environmental variable dataset. To determine which SPCs to add to the model, we added the SPCs in order from rank #1 to rank #18. For example, we tested whether the model fit best when SPC 1 was added; when SPC 1 and 2 were added; when SPC 1, 2, and 3 were added; &#x02026;; and when SPC 1 through 18 were added. The model also included 6 non-environmental covariates to control for variables that likely affect asthma diagnosis.</p><p id="P24">The change in log odds of asthma diagnosis per unit measure of sparse principal component <italic>m</italic>, <italic>&#x003b4;<sub>m</sub></italic>(<italic>SPC</italic>)<italic><sub>j</sub></italic><sub>,</sub><italic><sub>m</sub></italic>, was examined for each Wisconsin block group <italic>j</italic>. As 
<inline-formula><mml:math id="M4" overflow="scroll"><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi mathvariant="italic">SPC</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>j</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mn>20</mml:mn></mml:msubsup><mml:mrow><mml:msub><mml:mi>&#x003b7;</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>E</mml:mi><mml:mi>V</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>n</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> where <italic>EV</italic> is an environmental variable from the ESRI database, the associated effect on the change in log odds of asthma diagnosis for an individual environmental variable could be assessed via the signs of the loading <italic>&#x003b7;<sub>n</sub></italic> and the model coefficient <italic>&#x003b4;<sub>m</sub></italic>. All statistical analyses were performed in R [<xref rid="R34" ref-type="bibr">34</xref>,<xref rid="R41" ref-type="bibr">41</xref>].</p><p id="P57">The graphical abstract summarizes the SASEA methods integrated in this study. We began with electronic health record data (covariates, asthma diagnosis as defined above, and the block group participants resided in) and environmental variables from ESRI (values represent measurements from a block group). We applied sparse principal component analysis to the environmental variables. We combined the EHR dataset with the sparse principal components from the environmental variable dataset. We ran a logistic thin plate regression spline model on this combined dataset. Bayesian information criterion was used to select the number of sparse principal components added to the model. The odds ratios for variables in the logistic regression model were reported. 
The change in log odds value was color coded and mapped to block groups.</p></sec></sec><sec sec-type="results" id="S11"><title>RESULTS</title><p id="P25"><xref rid="F1" ref-type="fig">Figure 1a</xref> shows major cities and population by county in Wisconsin. <xref rid="F1" ref-type="fig">Figure 1b</xref> shows the total number of patients from the EHR dataset per block group. The majority of patients were in Dane County, WI and eight southern counties. Most participants were near the four, more urban cities including Madison, Eau Claire, Wausau, and Appleton. The median and maximum number of participants per block was 5 and 2,673, respectively. 927 out of 3,307 block groups had greater than 20 total participants. The asthma period prevalence from 2007&#x02013;2009 was 8.4% (16,739 out of 199,220).</p><p id="P26"><xref rid="F2" ref-type="fig">Figure 2</xref> shows asthma prevalence and the Logistic thin plate regression spline model predicted prevalence for each block group using only a coordinate bivariate smooth term, log(<italic>asthma<sub>i</sub></italic><sub>,</sub><italic><sub>j</sub></italic>) = <italic>f</italic>(<italic>x<sub>i</sub></italic>, <italic>y<sub>i</sub></italic>) + <italic>&#x003b5;<sub>i</sub></italic><sub>,</sub><italic><sub>j</sub></italic>. The median and maximum prevalence estimates were 0% and 100%, which was expected as many block groups had a low total number of participants (<xref rid="F3" ref-type="fig">Figure 3a</xref>). However, the regression model was intended to smooth prevalence and decrease extreme values (<xref rid="F3" ref-type="fig">Figure 3b</xref>). The predicted prevalence had a minimum, median, and maximum prevalence of 2.3%, 6.8%, and 12%. Spatially, higher prevalence was modeled in the urban southcentral, rural southwestern, and central regions of the state. 
Lower prevalence was modeled in rural areas of the state.</p><p id="P27">The Logistic thin plate regression spline model with covariates had the lowest BIC when four sparse principal components were added to the model (56,511) compared with the model containing no sparse principal components (63,974) or 2&#x02013;3 and 5&#x02013;18 sparse principal components (56,528&#x02013;56,581). The four sparse principal components accounted for 0.9%, 0.7%, 0.5% and 0.2% of the variance from the original dataset. The odds ratios of asthma diagnosis for covariates and the four sparse principal components are shown in <xref rid="T1" ref-type="table">Table 1</xref>. Race had the greatest effect size. The odds of asthma diagnosis for black participants were highest at 1.78 (1.63&#x02013;1.94) compared with the odds of asthma diagnosis for white participants. The odds of asthma diagnosis for Asian participants were lowest at 0.66 (0.57&#x02013;0.77) compared with the odds of asthma diagnosis for white participants. Hispanic ethnicity compared to non-Hispanic ethnicity and age per 10 years had a moderate decrease in the odds ratio of asthma diagnosis. Sex, encounter days, and distance to clinic had no or smaller effect size on asthma diagnosis odds ratio. Of the sparse principal components, sparse principal components 2 with an odds ratio of 0.95 (0.89&#x02013;0.99) and 4 with an odds ratio of 1.13 (1.01&#x02013;1.27) were significant. The range of data values for sparse principal components 1, 2, 3, and 4 was 30.9, 14.4, 21.1, and 10.6, respectively.</p><p id="P28"><xref rid="T2" ref-type="table">Table 2</xref> shows representative, high loading environmental variables of the four sparse principal components. Variable loadings and model coefficients are shown as well. Variables of significant sparse principal components with positive loadings and positive model coefficients, including households with disposable income less than $15,000, were positively associated with asthma. 
Variables of significant sparse principal components with negative loadings and negative model coefficients, including renter occupied housing units, were positively associated with asthma. Variables of significant sparse principal components with positive loadings and negative model coefficients, including dog ownership, were negatively associated with asthma (please see <xref rid="SD1" ref-type="supplementary-material">Appendix Table 2</xref> for all variables and loadings of these four sparse principal components).</p><p id="P29">The change in log odds of asthma diagnosis per unit measure of sparse principal components 2 and 4 are shown in <xref rid="F3" ref-type="fig">Figure 3</xref>. The change in log odds was calculated for sparse principal component <italic>m</italic>, Wisconsin block group <italic>j</italic>, and model coefficient <italic>&#x003b4;</italic> as <italic>&#x003b4;<sub>m</sub></italic>(<italic>SPC</italic>)<italic><sub>j</sub></italic><sub>,</sub><italic><sub>m</sub></italic>. The urban areas of Wisconsin include Madison, Milwaukee, Eau Claire, La Crosse, and Appleton, whose locations are shown in <xref rid="F2" ref-type="fig">Figure 2a</xref>. For sparse principal component 2, rural areas of the state had a positive change in log odds of asthma diagnosis (<xref rid="F3" ref-type="fig">Figure 3a</xref>). The two southern urban areas with a negative change in log odds included Madison and Milwaukee. As the SPC loading for dog ownership was positive and the model coefficient of SPC 2, <italic>&#x003b4;</italic><sub>2</sub>, was negative (<xref rid="T2" ref-type="table">Table 2</xref>), less dog ownership contributed to the positive change in log odds of asthma diagnosis in rural areas. As the SPC loading for renter occupied housing units was negative (<xref rid="T2" ref-type="table">Table 2</xref>), more renter occupied housing units contributed to the positive change in log odds of asthma diagnosis in rural areas. 
For sparse principal component 4, eastern areas of the state had a positive change in log odds of asthma diagnosis (<xref rid="F3" ref-type="fig">Figure 3b</xref>). As the SPC loading for households with a disposable income less than $15,000 was positive and the model coefficient of SPC 4, <italic>&#x003b4;</italic><sub>4</sub>, was positive (<xref rid="T2" ref-type="table">Table 2</xref>), more households with a disposable income less than $15,000 contributed to the positive change in log odds of asthma diagnosis in eastern Wisconsin.</p></sec><sec sec-type="discussion" id="S12"><title>DISCUSSION</title><p id="P30">It is estimated that the lack of medical care accounts for 10 percent of early deaths in the United States. The remaining determinants of health contributing to early deaths include genetics, social circumstances, environmental exposure, and behavioral patterns [<xref rid="R42" ref-type="bibr">42</xref>]. Our work utilizing SASEA is unique in the application of sparsity to spatial statistics. We used a large EHR dataset to identify sparse environmental variables associated with asthma. This methodology was able to identify several location-specific, environmental risk factors associated with asthma. Specifically, less dog ownership and more renter occupied housing units were associated with increased asthma in rural areas. More households with low disposable income were associated with increased asthma in eastern Wisconsin.</p><sec id="S13"><title>4.1 SASEA</title><p id="P31">We attempted to account for multiple comparisons of the many variables and identify a smaller set of interpretable risk factors. The SASEA method performs sparse principal component analysis outside of the regression model as a means to prevent overfitting. Twenty non-zero loading variables for each sparse principal component were chosen to consider small groups of variables. 
Sparse principal components were sequentially added using BIC for model selection given the greater variance represented by higher ranked components. The sequential addition allowed for further structured and sparse variable evaluation. Although a set of sparse principal components was selected by BIC (four in this study), only some may be significant based on the odds ratio (two in this study). This feature of SASEA enhances sparsity as well.</p><p id="P32">The integration of various scalable methods accommodated analysis of the EHR, environmental, and geographical datasets. Use of adaptable statistical model fitting based on well-studied algorithms was an asset that allowed for simple extension to the large number of patients and variables.</p></sec><sec id="S14"><title>4.2 Community variables associated with asthma</title><p id="P33">Similar to our study, two additional studies [<xref rid="R14" ref-type="bibr">14</xref>,<xref rid="R15" ref-type="bibr">15</xref>] investigated community environmental variables associated with asthma. In our study, asthma was defined based on EHRs compared to survey data in the other two studies [<xref rid="R14" ref-type="bibr">14</xref>,<xref rid="R15" ref-type="bibr">15</xref>]. Many variables overlapped among these three studies. Our study and Krsti&#x00107;&#x02019;s study [<xref rid="R14" ref-type="bibr">14</xref>] used latitude and longitude. We did not use insolation, air temperature or air pollution. Shankardass et al. [<xref rid="R15" ref-type="bibr">15</xref>] and our study had the individual variables of age, race, gender, and BMI. Shankardass et al. [<xref rid="R15" ref-type="bibr">15</xref>] included more individual variables including freeway distance while our study included more community environmental variables. We did not have male unemployment, which Shankardass et al. [<xref rid="R15" ref-type="bibr">15</xref>] found significantly associated with asthma. 
However we had other variables similar to socioeconomic status such as disposable income and employed civilian population in sparse principal components.</p><p id="P34">For analyses, Krsti&#x00107; [<xref rid="R14" ref-type="bibr">14</xref>] used linear regression, Shankardass et al. [<xref rid="R15" ref-type="bibr">15</xref>] used multilevel logistic random effect modeling, and our study used logistic thin plate regression spline modeling. The random effect modeling likely was more applicable to Shankardass et al. [<xref rid="R15" ref-type="bibr">15</xref>] as communities were concentrated. In our study the random effect in addition to thin plate regression spline based on latitude and longitude was chosen because of the distribution of patients throughout the state of Wisconsin.</p></sec><sec id="S15"><title>4.3 Sparse principal components associated with asthma</title><p id="P35">As seen in other studies [<xref rid="R6" ref-type="bibr">6</xref>,<xref rid="R40" ref-type="bibr">40</xref>], higher asthma prevalence was associated with increased BMI, female sex, and black race, while lower asthma prevalence was associated with Hispanic ethnicity. Age, encounter days in the EHR dataset, and distance to most frequented clinic had little association with asthma diagnosis. Sparse principal component 2 represented by dog ownership and renter occupied housing units in addition to sparse principal component 4 represented by disposable income less than $15,000 were significantly associated with asthma. The individual variables representing sparse principal components likely contributed a small effect size.</p><p id="P36">Previous studies support the association of asthma and the environmental variables representing the sparse principal components in this study. In this study, dog ownership had a negative association with asthma. 
Other studies have shown perinatal and early life exposure to dog allergen was associated with reduced allergy and asthma risk later in life [<xref rid="R43" ref-type="bibr">43</xref>,<xref rid="R44" ref-type="bibr">44</xref>]. Renter occupied housing units were positively associated with asthma in a Brazil study [<xref rid="R45" ref-type="bibr">45</xref>]. Rental housing was associated with cold and damp housing, which in turn were associated with increased asthma [<xref rid="R46" ref-type="bibr">46</xref>]. Lastly, lower socioeconomic status as reflected by disposable income less than $15,000 was associated with greater asthma. Previously mentioned studies came to similar conclusions [<xref rid="R5" ref-type="bibr">5</xref>,<xref rid="R6" ref-type="bibr">6</xref>]. However, these results contradicted the positive association of socioeconomic status with asthma found by Shankardass et al. [<xref rid="R15" ref-type="bibr">15</xref>]. Thus, the SASEA method used in this study identified variables that were previously associated with asthma risk, suggesting that these methods may have a role in studying chronic disease.</p><p id="P37">Mapping the associated change in log odds of asthma for a sparse principal component highlighted the geographic distribution of these sparse principal components and high loading environmental variables. The urban and rural discrepancy seen in differences in renter occupied housing units may be driven by the built environment, the human-made space where people live and work [<xref rid="R47" ref-type="bibr">47</xref>].</p></sec><sec id="S16" sec-type="methods"><title>4.4 EHR as a measure of clinical data</title><p id="P38">The use of EHR and block group characteristics merits comparison with traditional forms of health surveys including self-report and public health measured data. 
Canadian studies suggested census aggregate-level measures of income and education did not approximate individual level measures well [<xref rid="R48" ref-type="bibr">48</xref>&#x02013;<xref rid="R51" ref-type="bibr">51</xref>]. There was similarity between self-reported variables and clinically measured variables. Self-reported colon cancer screening was similar to EHR imputed data [<xref rid="R52" ref-type="bibr">52</xref>]. Public health measured data were similar to EHR measured data. For example, BMI-based childhood obesity was 18% in both an EHR dataset and the National Health and Nutrition Examination Survey [<xref rid="R53" ref-type="bibr">53</xref>].</p><p id="P39">Agreement between disease prevalence based on health surveys and disease prevalence based on EHR datasets varies depending on disease. EHR datasets had prevalence similar to that from surveys for test-based conditions (e.g. diabetes) and decreased prevalence for minor conditions (e.g. back pain, headache, skin conditions) [<xref rid="R54" ref-type="bibr">54</xref>&#x02013;<xref rid="R56" ref-type="bibr">56</xref>]. Specifically, two Spanish studies showed that the asthma prevalence calculated from an EHR dataset was lower compared with asthma prevalence calculated from population surveys [<xref rid="R54" ref-type="bibr">54</xref>,<xref rid="R55" ref-type="bibr">55</xref>]. However, asthma prevalence based on UW eHealth-PHINEX (8.4%) was similar to the Wisconsin health survey, Behavioral Risk Factor Surveillance System (8.0%) [<xref rid="R20" ref-type="bibr">20</xref>]. 
As there is no single lab test for diagnosis of asthma, ICD-9 codes likely under-identify asthma when compared with &#x0201c;gold standard&#x0201d; manual record review [<xref rid="R57" ref-type="bibr">57</xref>] but may be more objective compared with population surveys.</p></sec></sec><sec sec-type="conclusions" id="S17"><title>CONCLUSIONS</title><sec id="S18" sec-type="methods"><title>5.1 Future work and alternative methods</title><p id="P40">Further analysis to determine the individual variables from sparse principal components that are associated with asthma could be performed using traditional methods such as stepwise model selection with BIC. This analysis could be performed with UW eHealth-PHINEX data from other years (e.g. 2009&#x02013;2012), a UW eHealth-PHINEX hold out dataset, or a non-UW eHealth-PHINEX EHR dataset in another geographic region.</p><p id="P41">There are many future directions for this work regarding diseases and methods. Our methods could be applied to asthma control, other chronic diseases, and different communities. The census block groups and ESRI environmental data are already available nationwide. It is foreseeable that with the integration of a national EHR dataset, this type of analysis will be utilized to identify spatial risk factors to allow investigation or evaluation of interventions in any geographic region [<xref rid="R58" ref-type="bibr">58</xref>].</p><p id="P42">Alternative methods could have been used in this study. Traditional Logistic regression without the smoothing term does not account for the unknown orientation of spatial correlation among asthma due to geography, nor does it directly address difficulties in high dimensional data by constructing sparse models. Other spatial models included conditional auto-regressive models <sup>[<xref rid="R59" ref-type="bibr">59</xref>]</sup>. 
As the four sparse principal components accounted for a small percentage of variance from the original dataset, other methods such as traditional principal components analysis or clustering could have been utilized. However, traditional principal component analysis maintained all variables in each principal component preventing sparse interpretation, and clustering environmental variables added complexity. Few variables could have been associated more directly with the Logistic thin plate regression spline model using least absolute shrinkage and selection operator [<xref rid="R30" ref-type="bibr">30</xref>] such as COSSO [<xref rid="R28" ref-type="bibr">28</xref>]. However, a new set of variables would be identified for different diseases and variables could not be grouped. Allowing regression coefficients to vary over space as in geographically weighted regression [<xref rid="R60" ref-type="bibr">60</xref>] could be accomplished with spatial smoothing spline interaction terms.</p></sec><sec id="S19"><title>5.2 Limitations</title><p id="P43">There were limitations to the study. Although measures were taken to prevent overfitting and accommodate high dimensionality, this was an ecological, data-mining study without <italic>a priori</italic> variable hypotheses. This additive non-linear model likely does not fully capture the complexity of environmental factors influencing asthma.</p><p id="P44">The associations noted in the study may be due to confounding factors. One must be cognizant of ecological bias, because results about groups of people do not necessarily translate to the same findings about individuals. 
However, the neighborhood in which an individual lives has been associated with health outcomes [<xref rid="R61" ref-type="bibr">61</xref>].</p><p id="P45">Multiple studies have shown the importance of EHR disease phenotype definitions, algorithm development, and validation [<xref rid="R62" ref-type="bibr">62</xref>&#x02013;<xref rid="R64" ref-type="bibr">64</xref>]. In this study, asthma cases were defined based on ICD-9 codes. Some have argued this may under-estimate asthma prevalence [<xref rid="R57" ref-type="bibr">57</xref>]. Aside from this study&#x02019;s EHR asthma phenotype definition, which is similar to the validated definition of Gershon et al. [<xref rid="R65" ref-type="bibr">65</xref>], other definitions such as the Healthcare Effectiveness Data and Information Set [<xref rid="R66" ref-type="bibr">66</xref>], have not been validated. We are currently validating alternative EHR phenotype definitions, which will also be used to segment asthma severity.</p><p id="P46">The results may be biased as UW eHealth-PHINEX data is not a complete representation of all block groups or persons in the state of Wisconsin. UW Family Medicine, Internal Medicine, and Pediatrics departments are an integrated health care system, but patients can receive care in at least two other major systems in the same catchment area. Because many hospitals and clinics have, or will soon have, an EHR system, sharing data through a statewide information exchange could mitigate this issue.</p><p id="P47">Another potential limitation is that the analysis included data elements from different years. While the EHR dataset represented years 2007&#x02013;2009, the patient addresses were from the date of EHR data extraction in year 2012. However, compared with other states, Wisconsin residents tend to move less frequently. 
Wisconsin is the fifth &#x0201c;stickiest&#x0201d; state, with 68.6% of the current residents having been born in Wisconsin, an indicator of decreased residential mobility [<xref rid="R67" ref-type="bibr">67</xref>]. Patient addresses were geocoded to year 2000 block groups to match the ESRI database. ESRI database variables were mostly from year 2010, while some census variables were from year 2000. There is minimal change in block group from year to year and the goal was to identify general trends of larger geographic areas. The ESRI year 2010 variables were closest to the EHR database dates and the census year 2010 variables were not yet available.</p><p id="P48">Our main contribution is the incorporation of sparsity in spatial modeling. The sequential addition of sparse principal components to Logistic thin plate regression allowed interpretable analysis of geographically distributed EHR and environmental datasets. Understanding spatial disease variation and environmental risk factors using methods such as SASEA can allow better explanation of geographical disease disparity.</p></sec></sec><sec sec-type="supplementary-material" id="S21"><title>Supplementary Material</title><supplementary-material content-type="local-data" id="SD1"><label>supplement</label><media xlink:href="NIHMS651282-supplement.docx" orientation="portrait" xlink:type="simple" id="d36e1494" position="anchor"/></supplementary-material></sec></body><back><ack id="S22"><p>We thank Michael Coen for his insightful comments during methodology discussions.</p><p><bold>FUNDING</bold></p><p>This study was supported by the Clinical and Translational Science Award program, previously through the National Center for Research Resources grant 1UL1RR025011, and now by the National Center for Advancing Translational Sciences grant 9U54TR000021. 
This investigation was also supported by the NIH T32 GM008692, the National Heart Lung and Blood Institute Fellowship F30HL112491, and the Wisconsin Division of Public Health from the Centers for Disease Control and Prevention through the Wisconsin Environmental Public Health Tracking grant 1U38EH000951-01 and Public Health Improvement Initiative 5U58CD001316-02.</p></ack><fn-group><fn id="FN5"><p>The content is solely the responsibility of the authors and does not necessarily represent the official views of the NIH or the CDC.</p></fn><fn id="FN6"><p content-type="publisher-disclaimer">This is a PDF file of an unedited manuscript that has been accepted for publication. As a service to our customers we are providing this early version of the manuscript. The manuscript will undergo copyediting, typesetting, and review of the resulting proof before it is published in its final citable form. Please note that during the production process errors may be discovered which could affect the content, and all legal disclaimers that apply to the journal pertain.</p></fn></fn-group><glossary id="GL"><title>Abbreviations</title><def-list><def-item><term id="G1">UW eHealth-PHINEX</term><def><p>University of Wisconsin Electronic Health Record &#x02013; Public Health Information Exchange</p></def></def-item><def-item><term id="G2">EHR</term><def><p>electronic health record</p></def></def-item></def-list></glossary><ref-list><ref id="R1"><label>1</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Manolio</surname><given-names>TA</given-names></name></person-group><article-title>Genomewide association studies and assessment of the risk of disease</article-title><source>N Engl J Med</source><year>2010</year><volume>363</volume><fpage>166</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1056/NEJMra0905980</pub-id><pub-id pub-id-type="pmid">20647212</pub-id></element-citation></ref><ref id="R2"><label>2</label><element-citation 
publication-type="book"><person-group person-group-type="author"><name><surname>Pr&#x000fc;ss-&#x000dc;st&#x000fc;n</surname><given-names>A</given-names></name><name><surname>Corval&#x000e1;n</surname><given-names>C</given-names></name></person-group><source>Preventing disease through healthy environments: Towards an estimate of the environmental burden of disease</source><publisher-name>World Health Organization</publisher-name><year>2006</year></element-citation></ref><ref id="R3"><label>3</label><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Busse</surname><given-names>WW</given-names></name><name><surname>Boushey</surname><given-names>HA</given-names></name><name><surname>Camargo</surname><given-names>CA</given-names></name><etal/></person-group><source>National Asthma Education and Prevention Program: Expert Panel Report 3: Guidelines for the Diagnosis and Management of Asthma, Summary Report 2007</source><publisher-loc>Bethesda, MD</publisher-loc><publisher-name>National Institutes of Health; National Heart, Lung, and Blood Institute</publisher-name><year>2007</year></element-citation></ref><ref id="R4"><label>4</label><element-citation publication-type="web"><collab>Centers for Disease Control and Prevention</collab><source>Asthma&#x02019;s Impact on the Nation: Data From the CDC National Asthma Control Program</source><comment><ext-link ext-link-type="uri" xlink:href="http://www.cdc.gov/asthma/impacts_nation/AsthmaFactSheet.pdf">http://www.cdc.gov/asthma/impacts_nation/AsthmaFactSheet.pdf</ext-link></comment><date-in-citation>accessed 21 May2013</date-in-citation></element-citation></ref><ref id="R5"><label>5</label><element-citation publication-type="web"><collab>Wisconsin Department of Health Services, Division of Public Health, Bureau of Environmental and Occupational Health</collab><source>Burden of Asthma in Wisconsin 2010</source><year>2012</year><comment><ext-link ext-link-type="uri" 
xlink:href="http://www.dhs.wisconsin.gov/eh/asthma/pdf/BurdenofAsthma2010Web.pdf">http://www.dhs.wisconsin.gov/eh/asthma/pdf/BurdenofAsthma2010Web.pdf</ext-link></comment><date-in-citation>accessed 21 May2013</date-in-citation></element-citation></ref><ref id="R6"><label>6</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zahran</surname><given-names>HS</given-names></name><name><surname>Bailey</surname><given-names>C</given-names></name></person-group><article-title>Factors associated with asthma prevalence among racial and ethnic groups-United States, 2009&#x02013;2010 Behavioral Risk Factor Surveillance System</article-title><source>J Asthma</source><comment>Published Online First: 11 April 2013</comment><pub-id pub-id-type="doi">10.3109/02770903.2013.794238</pub-id></element-citation></ref><ref id="R7"><label>7</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Arbes</surname><given-names>SJ</given-names><suffix>Jr</suffix></name><name><surname>Gergen</surname><given-names>PJ</given-names></name><name><surname>Vaughn</surname><given-names>B</given-names></name><etal/></person-group><article-title>Asthma cases attributable to atopy: results from the Third National Health and Nutrition Examination Survey</article-title><source>J Allergy Clin Immunol</source><year>2007</year><volume>120</volume><fpage>1139</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1016/j.jaci.2007.07.056</pub-id><pub-id pub-id-type="pmid">17889931</pub-id></element-citation></ref><ref id="R8"><label>8</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Torrent</surname><given-names>M</given-names></name><name><surname>Sunyer</surname><given-names>J</given-names></name><name><surname>Garcia</surname><given-names>R</given-names></name><etal/></person-group><article-title>Early-life allergen exposure and atopy, asthma, 
and wheeze up to 6 years of age</article-title><source>Am J Respir Crit Care Med</source><year>2007</year><volume>176</volume><fpage>446</fpage><lpage>53</lpage><pub-id pub-id-type="doi">10.1164/rccm.200607-916OC</pub-id><pub-id pub-id-type="pmid">17575100</pub-id></element-citation></ref><ref id="R9"><label>9</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Porsbjerg</surname><given-names>C</given-names></name><name><surname>von Linstow</surname><given-names>M-L</given-names></name><name><surname>Ulrik</surname><given-names>CS</given-names></name><etal/></person-group><article-title>Risk factors for onset of asthma: a 12-year prospective follow-up study</article-title><source>Chest</source><year>2006</year><volume>129</volume><fpage>309</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1378/chest.129.2.309</pub-id><pub-id pub-id-type="pmid">16478846</pub-id></element-citation></ref><ref id="R10"><label>10</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jackson</surname><given-names>DJ</given-names></name><name><surname>Evans</surname><given-names>MD</given-names></name><name><surname>Gangnon</surname><given-names>RE</given-names></name><etal/></person-group><article-title>Evidence for a causal relationship between allergic sensitization and rhinovirus wheezing in early life</article-title><source>Am J Respir Crit Care Med</source><year>2012</year><volume>185</volume><fpage>281</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1164/rccm.201104-0660OC</pub-id><pub-id pub-id-type="pmid">21960534</pub-id></element-citation></ref><ref id="R11"><label>11</label><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Perkins+Will</surname></name></person-group><source>Healthy Environments: A Compilation of Substances Linked to Asthma</source><publisher-loc>New York, 
NY</publisher-loc><year>2012</year><comment><ext-link ext-link-type="uri" xlink:href="http://transparency.perkinswill.com/assets/whitepapers/NIH_AsthmaReport_2012.pdf">http://transparency.perkinswill.com/assets/whitepapers/NIH_AsthmaReport_2012.pdf</ext-link></comment><date-in-citation>accessed 8 May 2013</date-in-citation></element-citation></ref><ref id="R12"><label>12</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Patel</surname><given-names>MM</given-names></name><name><surname>Miller</surname><given-names>RL</given-names></name></person-group><article-title>Air pollution and childhood asthma: recent advances and future directions</article-title><source>Curr Opin Pediatr</source><year>2009</year><volume>21</volume><fpage>235</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1097/MOP.0b013e3283267726</pub-id><pub-id pub-id-type="pmid">19663041</pub-id></element-citation></ref><ref id="R13"><label>13</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hales</surname><given-names>S</given-names></name><name><surname>Lewis</surname><given-names>S</given-names></name><name><surname>Slater</surname><given-names>T</given-names></name><etal/></person-group><article-title>Prevalence of adult asthma symptoms in relation to climate in New Zealand</article-title><source>Environ Health Perspect</source><year>1998</year><volume>106</volume><fpage>607</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1289/ehp.98106607</pub-id><pub-id pub-id-type="pmid">9722625</pub-id></element-citation></ref><ref id="R14"><label>14</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Krsti&#x00107;</surname><given-names>G</given-names></name></person-group><article-title>Asthma prevalence associated with geographical latitude and regional insolation in the United States of America and 
Australia</article-title><source>PLoS ONE</source><year>2011</year><volume>6</volume><fpage>e18492</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0018492</pub-id><pub-id pub-id-type="pmid">21494627</pub-id></element-citation></ref><ref id="R15"><label>15</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shankardass</surname><given-names>K</given-names></name><name><surname>McConnell</surname><given-names>RS</given-names></name><name><surname>Milam</surname><given-names>J</given-names></name><etal/></person-group><article-title>The association between contextual socioeconomic factors and prevalent asthma in a cohort of Southern California school children</article-title><source>Soc Sci Med</source><year>2007</year><volume>65</volume><fpage>1792</fpage><lpage>806</lpage><pub-id pub-id-type="doi">10.1016/j.socscimed.2007.05.048</pub-id><pub-id pub-id-type="pmid">17658674</pub-id></element-citation></ref><ref id="R16"><label>16</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Strachan</surname><given-names>DP</given-names></name></person-group><article-title>Hay fever, hygiene, and household size</article-title><source>BMJ</source><year>1989</year><volume>299</volume><fpage>1259</fpage><lpage>60</lpage><pub-id pub-id-type="pmid">2513902</pub-id></element-citation></ref><ref id="R17"><label>17</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yazdanbakhsh</surname><given-names>M</given-names></name><name><surname>Kremsner</surname><given-names>PG</given-names></name><name><surname>van Ree</surname><given-names>R</given-names></name></person-group><article-title>Allergy, Parasites, and the Hygiene Hypothesis</article-title><source>Science</source><year>2002</year><volume>296</volume><fpage>490</fpage><lpage>4</lpage><pub-id pub-id-type="doi">10.1126/science.296.5567.490</pub-id><pub-id 
pub-id-type="pmid">11964470</pub-id></element-citation></ref><ref id="R18"><label>18</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Juhn</surname><given-names>YJ</given-names></name><name><surname>Qin</surname><given-names>R</given-names></name><name><surname>Urm</surname><given-names>S</given-names></name><etal/></person-group><article-title>The influence of neighborhood environment on the incidence of childhood asthma: a propensity score approach</article-title><source>J Allergy Clin Immunol</source><year>2010</year><volume>125</volume><fpage>838</fpage><lpage>843</lpage><fpage>e2</fpage><pub-id pub-id-type="doi">10.1016/j.jaci.2009.12.998</pub-id><pub-id pub-id-type="pmid">20236695</pub-id></element-citation></ref><ref id="R19"><label>19</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Holt</surname><given-names>EW</given-names></name><name><surname>Theall</surname><given-names>KP</given-names></name><name><surname>Rabito</surname><given-names>FA</given-names></name></person-group><article-title>Individual, Housing, and Neighborhood Correlates of Asthma among Young Urban Children</article-title><source>Journal of Urban Health</source><year>2013</year><volume>90</volume><fpage>1116</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1007/s11524-012-9709-3</pub-id></element-citation></ref><ref id="R20"><label>20</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tomasallo</surname><given-names>C</given-names></name><name><surname>Hanrahan</surname><given-names>LP</given-names></name><name><surname>Arndt</surname><given-names>B</given-names></name><etal/></person-group><article-title>Estimating Wisconsin Asthma Prevalence Using Clinical Electronic Health Records and Public Health Data. 
Forthcoming</article-title><source>Am J Public Health</source><year>2013</year></element-citation></ref><ref id="R21"><label>21</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kelly</surname><given-names>F</given-names></name><name><surname>Armstrong</surname><given-names>B</given-names></name><name><surname>Atkinson</surname><given-names>R</given-names></name><etal/></person-group><article-title>The London low emission zone baseline study</article-title><source>Res Rep Health Eff Inst</source><year>2011</year><fpage>3</fpage><lpage>79</lpage><pub-id pub-id-type="pmid">22315924</pub-id></element-citation></ref><ref id="R22"><label>22</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jilcott</surname><given-names>SB</given-names></name><name><surname>Wade</surname><given-names>S</given-names></name><name><surname>McGuirt</surname><given-names>JT</given-names></name><etal/></person-group><article-title>The association between the food environment and weight status among eastern North Carolina youth</article-title><source>Public Health Nutr</source><year>2011</year><volume>14</volume><fpage>1610</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1017/S1368980011000668</pub-id><pub-id pub-id-type="pmid">21486525</pub-id></element-citation></ref><ref id="R23"><label>23</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schwartz</surname><given-names>BS</given-names></name><name><surname>Stewart</surname><given-names>WF</given-names></name><name><surname>Godby</surname><given-names>S</given-names></name><etal/></person-group><article-title>Body mass index and the built and social environments in children and adolescents using electronic health records</article-title><source>Am J Prev Med</source><year>2011</year><volume>41</volume><fpage>e17</fpage><lpage>28</lpage><pub-id 
pub-id-type="doi">10.1016/j.amepre.2011.06.038</pub-id><pub-id pub-id-type="pmid">21961475</pub-id></element-citation></ref><ref id="R24"><label>24</label><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Waller</surname><given-names>LA</given-names></name><name><surname>Gotway</surname><given-names>CA</given-names></name></person-group><source>Applied spatial statistics for public health data</source><edition>1</edition><publisher-loc>New York, NY</publisher-loc><publisher-name>Wiley-Interscience</publisher-name><year>2004</year></element-citation></ref><ref id="R25"><label>25</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hastie</surname><given-names>T</given-names></name><name><surname>Tibshirani</surname><given-names>R</given-names></name></person-group><article-title>Generalized additive models</article-title><source>Statistical Science</source><year>1986</year><volume>1</volume><fpage>297</fpage><lpage>310</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1214/ss/1177013604">dx.doi.org/10.1214/ss/1177013604</ext-link></comment></element-citation></ref><ref id="R26"><label>26</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname><given-names>H-H</given-names></name><name><surname>Shin</surname><given-names>SS</given-names></name><name><surname>Contreras</surname><given-names>C</given-names></name><etal/></person-group><article-title>Use of spatial information to predict multidrug resistance in tuberculosis patients, Peru</article-title><source>Emerging Infect Dis</source><year>2012</year><volume>18</volume><fpage>811</fpage><lpage>3</lpage><pub-id pub-id-type="doi">10.3201/eid1805.111467</pub-id><pub-id pub-id-type="pmid">22516236</pub-id></element-citation></ref><ref id="R27"><label>27</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Chaix</surname><given-names>B</given-names></name><name><surname>Rosvall</surname><given-names>M</given-names></name><name><surname>Lynch</surname><given-names>J</given-names></name><etal/></person-group><article-title>Disentangling contextual effects on cause-specific mortality in a longitudinal 23-year follow-up study: impact of population density or socioeconomic environment?</article-title><source>Int J Epidemiol</source><year>2006</year><volume>35</volume><fpage>633</fpage><lpage>43</lpage><pub-id pub-id-type="doi">10.1093/ije/dyl009</pub-id><pub-id pub-id-type="pmid">16452106</pub-id></element-citation></ref><ref id="R28"><label>28</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname><given-names>Y</given-names></name><name><surname>Zhang</surname><given-names>HH</given-names></name></person-group><article-title>Component selection and smoothing in multivariate nonparametric regression</article-title><source>The Annals of Statistics</source><year>2006</year><volume>34</volume><fpage>2272</fpage><lpage>97</lpage><pub-id pub-id-type="doi">10.1214/009053606000000722</pub-id></element-citation></ref><ref id="R29"><label>29</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ravikumar</surname><given-names>P</given-names></name><name><surname>Lafferty</surname><given-names>J</given-names></name><name><surname>Liu</surname><given-names>H</given-names></name><etal/></person-group><article-title>Sparse additive models</article-title><source>Journal of the Royal Statistical Society: Series B (Statistical Methodology)</source><year>2009</year><volume>71</volume><fpage>1009</fpage><lpage>30</lpage><pub-id pub-id-type="doi">10.1111/j.1467-9868.2009.00718.x</pub-id></element-citation></ref><ref id="R30"><label>30</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Tibshirani</surname><given-names>R</given-names></name></person-group><article-title>Regression shrinkage and selection via the lasso</article-title><source>Journal of the Royal Statistical Society Series B (Methodological)</source><year>1996</year><volume>58</volume><fpage>267</fpage><lpage>88</lpage><pub-id pub-id-type="doi">10.1111/j.2517-6161.1996.tb02080.x</pub-id></element-citation></ref><ref id="R31"><label>31</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Meier</surname><given-names>L</given-names></name><name><surname>Van de Geer</surname><given-names>S</given-names></name><name><surname>B&#x000fc;hlmann</surname><given-names>P</given-names></name></person-group><article-title>High-dimensional additive modeling</article-title><source>The Annals of Statistics</source><year>2009</year><volume>37</volume><fpage>3779</fpage><lpage>821</lpage><pub-id pub-id-type="doi">10.1214/09-AOS692</pub-id></element-citation></ref><ref id="R32"><label>32</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guilbert</surname><given-names>TW</given-names></name><name><surname>Arndt</surname><given-names>B</given-names></name><name><surname>Temte</surname><given-names>J</given-names></name><etal/></person-group><article-title>The Theory and Application of UW eHealth-PHINEX, A Clinical Electronic Health Record&#x02013;Public Health Information Exchange</article-title><source>Wisconsin Medical Journal</source><year>2012</year><volume>111</volume><fpage>124</fpage><lpage>33</lpage><pub-id pub-id-type="pmid">22870558</pub-id></element-citation></ref><ref id="R33"><label>33</label><element-citation publication-type="book"><collab>Esri</collab><source>Esri Business Analyst Desktop Premium</source><publisher-loc>Redlands, CA</publisher-loc><publisher-name>Environmental Systems Research 
Institute</publisher-name><year>2010</year><comment><ext-link ext-link-type="uri" xlink:href="http://www.esri.com/software/arcgis/extensions/businessanalyst/data-us-prem.html">http://www.esri.com/software/arcgis/extensions/businessanalyst/data-us-prem.html</ext-link></comment><date-in-citation>accessed 17 Jun 2012</date-in-citation></element-citation></ref><ref id="R34"><label>34</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wood</surname><given-names>SN</given-names></name></person-group><article-title>Thin plate regression splines</article-title><source>J R Stat Soc: Series B (Statistical Methodology)</source><year>2003</year><volume>65</volume><fpage>95</fpage><lpage>114</lpage><pub-id pub-id-type="doi">10.1111/1467-9868.00374</pub-id></element-citation></ref><ref id="R35"><label>35</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wahba</surname><given-names>G</given-names></name></person-group><article-title>Spline interpolation and smoothing on the sphere</article-title><source>SIAM Journal on Scientific and Statistical Computing</source><year>1981</year><volume>2</volume><fpage>5</fpage><lpage>16</lpage></element-citation></ref><ref id="R36"><label>36</label><element-citation publication-type="book"><collab>Esri</collab><source>ArcGIS Desktop</source><publisher-loc>Redlands, CA</publisher-loc><publisher-name>Environmental Systems Research Institute</publisher-name><year>2010</year></element-citation></ref><ref id="R37"><label>37</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wood</surname><given-names>SN</given-names></name></person-group><article-title>Fast stable direct fitting and smoothness selection for generalized additive models</article-title><source>Journal of the Royal Statistical Society: Series B (Statistical 
Methodology)</source><year>2008</year><volume>70</volume><fpage>495</fpage><lpage>518</lpage><pub-id pub-id-type="doi">10.1111/j.1467-9868.2007.00646.x</pub-id></element-citation></ref><ref id="R38"><label>38</label><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Golub</surname><given-names>GH</given-names></name><name><surname>Van Loan</surname><given-names>CF</given-names></name></person-group><source>Matrix computations 1996</source><publisher-name>Johns Hopkins University Press</publisher-name><publisher-loc>Baltimore, MD, USA</publisher-loc><year>1983</year></element-citation></ref><ref id="R39"><label>39</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname><given-names>H</given-names></name><name><surname>Hastie</surname><given-names>T</given-names></name><name><surname>Tibshirani</surname><given-names>R</given-names></name></person-group><article-title>Sparse principal component analysis</article-title><source>J Comp Graph Stat</source><year>2006</year><volume>15</volume><fpage>265</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.1198/106186006X113430</pub-id></element-citation></ref><ref id="R40"><label>40</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schwarz</surname><given-names>G</given-names></name></person-group><article-title>Estimating the dimension of a model</article-title><source>Ann Statist</source><year>1978</year><volume>6</volume><fpage>461</fpage><lpage>4</lpage></element-citation></ref><ref id="R41"><label>41</label><element-citation publication-type="book"><collab>R Development Core Team</collab><source>R: Language and Environment for Statistical Computing</source><publisher-loc>Vienna, Austria</publisher-loc><publisher-name>R Foundation for Statistical Computing</publisher-name><year>2011</year><comment><ext-link ext-link-type="uri" 
xlink:href="http://www.R-project.org">http://www.R-project.org</ext-link></comment></element-citation></ref><ref id="R42"><label>42</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>McGinnis</surname><given-names>JM</given-names></name><name><surname>Williams-Russo</surname><given-names>P</given-names></name><name><surname>Knickman</surname><given-names>JR</given-names></name></person-group><article-title>The case for more active policy attention to health promotion</article-title><source>Health Aff (Millwood)</source><year>2002</year><volume>21</volume><fpage>78</fpage><lpage>93</lpage><pub-id pub-id-type="doi">10.1377/hlthaff.21.2.78</pub-id><pub-id pub-id-type="pmid">11900188</pub-id></element-citation></ref><ref id="R43"><label>43</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lodge</surname><given-names>CJ</given-names></name><name><surname>Allen</surname><given-names>KJ</given-names></name><name><surname>Lowe</surname><given-names>AJ</given-names></name><etal/></person-group><article-title>Perinatal cat and dog exposure and the risk of asthma and allergy in the urban environment: a systematic review of longitudinal studies</article-title><source>Clin Dev Immunol</source><year>2012</year><volume>2012</volume><fpage>176484</fpage><pub-id pub-id-type="doi">10.1155/2012/176484</pub-id><pub-id pub-id-type="pmid">22235226</pub-id></element-citation></ref><ref id="R44"><label>44</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Smallwood</surname><given-names>J</given-names></name><name><surname>Ownby</surname><given-names>D</given-names></name></person-group><article-title>Exposure to Dog Allergens and Subsequent Allergic Sensitization: An Updated Review</article-title><source>Current Allergy and Asthma Reports</source><comment>Published Online First: 9 June 2012</comment><pub-id 
pub-id-type="doi">10.1007/s11882-012-0277-0</pub-id></element-citation></ref><ref id="R45"><label>45</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Breda</surname><given-names>D</given-names></name><name><surname>Freitas</surname><given-names>PF</given-names></name><name><surname>Pizzichini</surname><given-names>E</given-names></name><etal/></person-group><article-title>Prevalence of asthma symptoms and risk factors among adolescents in Tubar&#x000e3;o and Capivari de Baixo, Santa Catarina State, Brazil</article-title><source>Cad Saude Publica</source><year>2009</year><volume>25</volume><fpage>2497</fpage><lpage>506</lpage><pub-id pub-id-type="pmid">19936487</pub-id></element-citation></ref><ref id="R46"><label>46</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Butler</surname><given-names>S</given-names></name><name><surname>Williams</surname><given-names>M</given-names></name><name><surname>Tukuitonga</surname><given-names>C</given-names></name><etal/></person-group><article-title>Problems with damp and cold housing among Pacific families in New Zealand</article-title><source>N Z Med J</source><year>2003</year><volume>116</volume><fpage>U494</fpage><pub-id pub-id-type="pmid">12861308</pub-id></element-citation></ref><ref id="R47"><label>47</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Roof</surname><given-names>K</given-names></name><name><surname>Oleru</surname><given-names>N</given-names></name></person-group><article-title>Public health: Seattle and King County&#x02019;s push for the built environment</article-title><source>J Environ Health</source><year>2008</year><volume>71</volume><fpage>24</fpage><lpage>7</lpage><pub-id pub-id-type="pmid">18724501</pub-id></element-citation></ref><ref id="R48"><label>48</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Marra</surname><given-names>CA</given-names></name><name><surname>Lynd</surname><given-names>LD</given-names></name><name><surname>Harvard</surname><given-names>SS</given-names></name><etal/></person-group><article-title>Agreement between aggregate and individual-level measures of income and education: a comparison across three patient groups</article-title><source>BMC Health Serv Res</source><year>2011</year><volume>11</volume><fpage>69</fpage><pub-id pub-id-type="doi">10.1186/1472-6963-11-69</pub-id><pub-id pub-id-type="pmid">21453534</pub-id></element-citation></ref><ref id="R49"><label>49</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Demissie</surname><given-names>K</given-names></name><name><surname>Hanley</surname><given-names>JA</given-names></name><name><surname>Menzies</surname><given-names>D</given-names></name><etal/></person-group><article-title>Agreement in measuring socio-economic status: area-based versus individual measures</article-title><source>Chronic Dis Can</source><year>2000</year><volume>21</volume><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="pmid">10813687</pub-id></element-citation></ref><ref id="R50"><label>50</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sin</surname><given-names>DD</given-names></name><name><surname>Svenson</surname><given-names>LW</given-names></name><name><surname>Man</surname><given-names>SF</given-names></name></person-group><article-title>Do area-based markers of poverty accurately measure personal poverty?</article-title><source>Can J Public Health</source><year>2001</year><volume>92</volume><fpage>184</fpage><lpage>7</lpage><pub-id pub-id-type="pmid">11496626</pub-id></element-citation></ref><ref id="R51"><label>51</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Southern</surname><given-names>DA</given-names></name><name><surname>McLaren</surname><given-names>L</given-names></name><name><surname>Hawe</surname><given-names>P</given-names></name><etal/></person-group><article-title>Individual-level and neighborhood-level income measures: agreement and association with outcomes in a cardiac disease cohort</article-title><source>Med Care</source><year>2005</year><volume>43</volume><fpage>1116</fpage><lpage>22</lpage><pub-id pub-id-type="pmid">16224305</pub-id></element-citation></ref><ref id="R52"><label>52</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Palaniappan</surname><given-names>LP</given-names></name><name><surname>Maxwell</surname><given-names>AE</given-names></name><name><surname>Crespi</surname><given-names>CM</given-names></name><etal/></person-group><article-title>Population Colorectal Cancer Screening Estimates: Comparing Self-Report to Electronic Health Record Data in California</article-title><source>Int J Canc Prev</source><year>2011</year><fpage>4</fpage></element-citation></ref><ref id="R53"><label>53</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bailey</surname><given-names>LC</given-names></name><name><surname>Milov</surname><given-names>DE</given-names></name><name><surname>Kelleher</surname><given-names>K</given-names></name><etal/></person-group><article-title>Multi-Institutional Sharing of Electronic Health Record Data to Assess Childhood Obesity</article-title><source>PLoS ONE</source><year>2013</year><volume>8</volume><fpage>e66192</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0066192</pub-id><pub-id pub-id-type="pmid">23823186</pub-id></element-citation></ref><ref id="R54"><label>54</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Viol&#x000e1;n</surname><given-names>C</given-names></name><name><surname>Foguet-Boreu</surname><given-names>Q</given-names></name><name><surname>Hermosilla-P&#x000e9;rez</surname><given-names>E</given-names></name><etal/></person-group><article-title>Comparison of the information provided by electronic health records data and a population health survey to estimate prevalence of selected health conditions and multimorbidity</article-title><source>BMC Public Health</source><year>2013</year><volume>13</volume><fpage>251</fpage><pub-id pub-id-type="doi">10.1186/1471-2458-13-251</pub-id><pub-id pub-id-type="pmid">23517342</pub-id></element-citation></ref><ref id="R55"><label>55</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Esteban-Vasallo</surname><given-names>MD</given-names></name><name><surname>Dom&#x000ed;nguez-Berj&#x000f3;n</surname><given-names>MF</given-names></name><name><surname>Astray-Mochales</surname><given-names>J</given-names></name><etal/></person-group><article-title>Epidemiological usefulness of population-based electronic clinical records in primary care: estimation of the prevalence of chronic diseases</article-title><source>Fam Pract</source><year>2009</year><volume>26</volume><fpage>445</fpage><lpage>54</lpage><pub-id pub-id-type="doi">10.1093/fampra/cmp062</pub-id><pub-id pub-id-type="pmid">19815673</pub-id></element-citation></ref><ref id="R56"><label>56</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Cricelli</surname><given-names>C</given-names></name><name><surname>Mazzaglia</surname><given-names>G</given-names></name><name><surname>Samani</surname><given-names>F</given-names></name><etal/></person-group><article-title>Prevalence estimates for chronic diseases in Italy: exploring the differences between self-report and primary care databases</article-title><source>J Public Health 
Med</source><year>2003</year><volume>25</volume><fpage>254</fpage><lpage>7</lpage><pub-id pub-id-type="pmid">14575204</pub-id></element-citation></ref><ref id="R57"><label>57</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Juhn</surname><given-names>Y</given-names></name><name><surname>Kung</surname><given-names>A</given-names></name><name><surname>Voigt</surname><given-names>R</given-names></name><etal/></person-group><article-title>Characterisation of children&#x02019;s asthma status by ICD-9 code and criteria-based medical record review</article-title><source>Prim Care Respir J</source><year>2011</year><volume>20</volume><fpage>79</fpage><lpage>83</lpage><pub-id pub-id-type="doi">10.4104/pcrj.2010.00076</pub-id><pub-id pub-id-type="pmid">21063669</pub-id></element-citation></ref><ref id="R58"><label>58</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ackermann</surname><given-names>RT</given-names></name><name><surname>Finch</surname><given-names>EA</given-names></name><name><surname>Brizendine</surname><given-names>E</given-names></name><etal/></person-group><article-title>Translating the Diabetes Prevention Program into the community. 
The DEPLOY Pilot Study</article-title><source>Am J Prev Med</source><year>2008</year><volume>35</volume><fpage>357</fpage><lpage>63</lpage><pub-id pub-id-type="doi">10.1016/j.amepre.2008.06.035</pub-id><pub-id pub-id-type="pmid">18779029</pub-id></element-citation></ref><ref id="R59"><label>59</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Besag</surname><given-names>J</given-names></name></person-group><article-title>Spatial interaction and the statistical analysis of lattice systems</article-title><source>Journal of the Royal Statistical Society Series B (Methodological)</source><year>1974</year><volume>36</volume><fpage>192</fpage><lpage>236</lpage></element-citation></ref><ref id="R60"><label>60</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brunsdon</surname><given-names>C</given-names></name><name><surname>Fotheringham</surname><given-names>S</given-names></name><name><surname>Charlton</surname><given-names>M</given-names></name></person-group><article-title>Geographically weighted regression</article-title><source>Journal of the Royal Statistical Society: Series D (The Statistician)</source><year>1998</year><volume>47</volume><fpage>431</fpage><lpage>43</lpage></element-citation></ref><ref id="R61"><label>61</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sampson</surname><given-names>RJ</given-names></name><name><surname>Morenoff</surname><given-names>JD</given-names></name><name><surname>Gannon-Rowley</surname><given-names>T</given-names></name></person-group><article-title>Assessing &#x02018;neighborhood effects&#x02019;: Social Processes and New Directions in Research</article-title><source>Annual Review of Sociology</source><year>2002</year><volume>28</volume><fpage>443</fpage><lpage>78</lpage><pub-id 
pub-id-type="doi">10.1146/annurev.soc.28.110601.141114</pub-id></element-citation></ref><ref id="R62"><label>62</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Newton</surname><given-names>KM</given-names></name><name><surname>Peissig</surname><given-names>PL</given-names></name><name><surname>Kho</surname><given-names>AN</given-names></name><etal/></person-group><article-title>Validation of electronic medical record-based phenotyping algorithms: results and lessons learned from the eMERGE network</article-title><source>J Am Med Inform Assoc</source><year>2013</year><volume>20</volume><fpage>e147</fpage><lpage>154</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2012-000896</pub-id><pub-id pub-id-type="pmid">23531748</pub-id></element-citation></ref><ref id="R63"><label>63</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Carroll</surname><given-names>RJ</given-names></name><name><surname>Thompson</surname><given-names>WK</given-names></name><name><surname>Eyler</surname><given-names>AE</given-names></name><etal/></person-group><article-title>Portability of an algorithm to identify rheumatoid arthritis in electronic health records</article-title><source>J Am Med Inform Assoc</source><year>2012</year><volume>19</volume><fpage>e162</fpage><lpage>169</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000583</pub-id><pub-id pub-id-type="pmid">22374935</pub-id></element-citation></ref><ref id="R64"><label>64</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Peissig</surname><given-names>PL</given-names></name><name><surname>Rasmussen</surname><given-names>LV</given-names></name><name><surname>Berg</surname><given-names>RL</given-names></name><etal/></person-group><article-title>Importance of multi-modal approaches to effectively identify cataract cases from electronic health 
records</article-title><source>J Am Med Inform Assoc</source><year>2012</year><volume>19</volume><fpage>225</fpage><lpage>34</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000456</pub-id><pub-id pub-id-type="pmid">22319176</pub-id></element-citation></ref><ref id="R65"><label>65</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gershon</surname><given-names>AS</given-names></name><name><surname>Wang</surname><given-names>C</given-names></name><name><surname>Guan</surname><given-names>J</given-names></name><etal/></person-group><article-title>Identifying patients with physician-diagnosed asthma in health administrative databases</article-title><source>Can Respir J</source><year>2009</year><volume>16</volume><fpage>183</fpage><lpage>8</lpage><pub-id pub-id-type="pmid">20011725</pub-id></element-citation></ref><ref id="R66"><label>66</label><element-citation publication-type="book"><collab>National Committee for Quality Assurance</collab><source>HEDIS Technical Specifications</source><publisher-loc>Washington, D.C</publisher-loc><publisher-name>National Committee for Quality Assurance</publisher-name><year>2008</year></element-citation></ref><ref id="R67"><label>67</label><element-citation publication-type="book"><person-group person-group-type="author"><name><surname>Taylor</surname><given-names>P</given-names></name><name><surname>Morin</surname><given-names>R</given-names></name><name><surname>Cohn</surname><given-names>DV</given-names></name><etal/></person-group><source>American Mobility: Who Moves? Who Stays Put? 
Where&#x02019;s Home?</source><publisher-loc>Washington, D.C</publisher-loc><publisher-name>Pew Research Center</publisher-name><year>2008</year><comment><ext-link ext-link-type="uri" xlink:href="http://pewsocialtrends.org/files/2011/04/American-Mobility-Report-updated-12-29-08.pdf">http://pewsocialtrends.org/files/2011/04/American-Mobility-Report-updated-12-29-08.pdf</ext-link></comment><date-in-citation>accessed 21 May 2013</date-in-citation></element-citation></ref></ref-list><app-group><app id="APP1"><title>APPENDIX A. <xref rid="SD1" ref-type="supplementary-material">Supplementary material</xref></title><p id="P53"><xref rid="SD1" ref-type="supplementary-material">Supplementary data</xref> associated with this article can be found in the online version.</p></app></app-group></back><floats-group><fig id="F1" orientation="portrait" position="float"><label>Figure 1</label><caption><title>Major Cities and County Population in Wisconsin and Total Number of Participants Per Block Group from UW eHealth-PHINEX</title><p>Major Wisconsin cities and population by county (a) and the total number of participants per block group in UW eHealth-PHINEX (b) are shown. White block groups do not contain any patient data. The light yellow block groups in (b) correspond to block groups with &#x02264; 20 total participants.</p></caption><graphic xlink:href="nihms651282f1"/></fig><fig id="F2" orientation="portrait" position="float"><label>Figure 2</label><caption><title>Asthma Prevalence and Logistic Thin Plate Regression Spline Model Predicted Prevalence</title><p>Asthma prevalence (a) and Logistic thin plate regression spline model predicted prevalence (b) are shown. The Logistic model only contains the thin plate regression spline smooth term. Two color maps are used to highlight areas of less or more confidence: blue for block groups with &#x02264; 20 total participants and red for block groups with &#x0003e; 20 total participants. White block groups do not contain any patient data. 
As intended, the regression model creates a smoother spatially predicted prevalence and decreases extreme values, resulting in more moderate (less extremely dark and extremely light) blue and red coloring.</p></caption><graphic xlink:href="nihms651282f2"/></fig><fig id="F3" orientation="portrait" position="float"><label>Figure 3</label><caption><title>Spatial Change in Log Odds for Sparse Principal Components 2 and 4</title><p>The change in log odds of asthma diagnosis per unit measure of sparse principal components 2 and 4 is shown at each block group. White represents a change in log odds between 0 and 0.01. The blue gradient represents a change in log odds &#x0003c;0, and the red gradient represents a change in log odds &#x0003e;0.01. There was a positive change in log odds of asthma diagnosis in rural areas while there was a negative change in log odds in the urban areas of Madison and Milwaukee for sparse principal component 2 (a). There was a positive change in log odds of asthma diagnosis in eastern areas of Wisconsin (b).</p></caption><graphic xlink:href="nihms651282f3"/></fig><table-wrap id="T1" position="float" orientation="portrait"><label>Table 1</label><caption><p>Odds ratios for variables in the Logistic thin plate regression spline model</p></caption><table frame="hsides" rules="groups"><thead><tr><th valign="bottom" align="left" rowspan="1" colspan="1"/><th valign="bottom" align="center" rowspan="1" colspan="1">OR (95% CI)</th></tr></thead><tbody><tr><td colspan="2" align="left" valign="top" rowspan="1"><bold>Sex</bold></td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Male</td><td align="center" valign="top" rowspan="1" colspan="1">reference</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Female</td><td align="center" valign="top" rowspan="1" colspan="1">1.00 (0.96, 1.05)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Age (per 10 years)</bold></td><td align="center" valign="top" 
rowspan="1" colspan="1">0.84 (0.82, 0.85)</td></tr><tr><td colspan="2" align="left" valign="top" rowspan="1"><bold>Race</bold></td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">White</td><td align="center" valign="top" rowspan="1" colspan="1">reference</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Black</td><td align="center" valign="top" rowspan="1" colspan="1">1.78 (1.63, 1.94)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Asian</td><td align="center" valign="top" rowspan="1" colspan="1">0.66 (0.57, 0.77)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">American Indian</td><td align="center" valign="top" rowspan="1" colspan="1">1.25 (1.00, 1.56)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Hawaiian or Pacific Islander</td><td align="center" valign="top" rowspan="1" colspan="1">1.29 (0.77, 2.18)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Unknown</td><td align="center" valign="top" rowspan="1" colspan="1">0.81 (0.68, 0.96)</td></tr><tr><td colspan="2" align="left" valign="top" rowspan="1"><bold>Ethnicity</bold></td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Non-Hispanic</td><td align="center" valign="top" rowspan="1" colspan="1">reference</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Hispanic</td><td align="center" valign="top" rowspan="1" colspan="1">0.79 (0.69, 0.90)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1">Unknown</td><td align="center" valign="top" rowspan="1" colspan="1">0.81 (0.68, 0.96)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>BMI (per 5 kg/m<sup>2</sup>)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">1.18 (1.16, 1.20)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Encounter days in EHR dataset (per 30 days)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">1.05 (1.04, 
1.05)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Distance to clinic (per 10 miles)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">1.02 (1.00, 1.03)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Sparse Principal Component 1 (per 5 units)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">1.00 (0.96, 1.05)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Sparse Principal Component 2 (per 5 units)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">0.95 (0.89, 0.99)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Sparse Principal Component 3 (per 5 units)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">0.94 (0.86, 1.03)</td></tr><tr><td align="left" valign="top" rowspan="1" colspan="1"><bold>Sparse Principal Component 4 (per 5 units)</bold></td><td align="center" valign="top" rowspan="1" colspan="1">1.13 (1.01, 1.27)</td></tr></tbody></table><table-wrap-foot><fn id="TFN1"><p>OR, odds ratio; CI, confidence interval; BMI, body mass index; EHR, electronic health record. Age, BMI, encounter days in EHR, distance to clinic, and sparse principal component odds ratios are scaled by 10 years, 5 kg/m<sup>2</sup>, 30 days, 10 miles, and 5 units, respectively.</p></fn></table-wrap-foot></table-wrap><table-wrap id="T2" position="float" orientation="portrait"><label>Table 2</label><caption><p>Representative Variables from Sparse Principal Components</p></caption><table frame="void" rules="groups"><thead><tr><th valign="middle" align="center" rowspan="1" colspan="1">Sparse Principal Component</th><th valign="middle" align="center" rowspan="1" colspan="1">Variable</th><th valign="middle" align="center" rowspan="1" colspan="1">SPC Loading</th><th valign="middle" align="center" rowspan="1" colspan="1">Model Coefficient (<italic>&#x003b4;</italic>)</th></tr></thead><tbody><tr><td align="center" valign="top" 
rowspan="1" colspan="1">1</td><td align="center" valign="top" rowspan="1" colspan="1">Food at Home: Average</td><td align="center" valign="top" rowspan="1" colspan="1">0.36</td><td align="center" valign="top" rowspan="1" colspan="1">6.1&#x000d7;10<sup>&#x02212;4</sup></td></tr><tr><td align="center" valign="top" rowspan="1" colspan="1">2</td><td align="center" valign="top" rowspan="1" colspan="1">Household owns 1 dog</td><td align="center" valign="top" rowspan="1" colspan="1">0.49</td><td align="center" valign="top" rowspan="1" colspan="1">&#x02212;1.1&#x000d7;10<sup>&#x02212;2</sup></td></tr><tr><td align="center" valign="top" rowspan="1" colspan="1"/><td align="center" valign="top" rowspan="1" colspan="1">Renter Occupied Housing Units</td><td align="center" valign="top" rowspan="1" colspan="1">&#x02212;0.41</td><td align="center" valign="top" rowspan="1" colspan="1"/></tr><tr><td align="center" valign="top" rowspan="1" colspan="1">3</td><td align="center" valign="top" rowspan="1" colspan="1">Average Household Size</td><td align="center" valign="top" rowspan="1" colspan="1">0.51</td><td align="center" valign="top" rowspan="1" colspan="1">&#x02212;1.2&#x000d7;10<sup>&#x02212;2</sup></td></tr><tr><td align="center" valign="top" rowspan="1" colspan="1">4</td><td align="center" valign="top" rowspan="1" colspan="1">Households with Disposable Income less than $15,000</td><td align="center" valign="top" rowspan="1" colspan="1">0.76</td><td align="center" valign="top" rowspan="1" colspan="1">2.5&#x000d7;10<sup>&#x02212;2</sup></td></tr></tbody></table><table-wrap-foot><fn id="TFN2"><p>SPC, sparse principal component</p></fn></table-wrap-foot></table-wrap><boxed-text id="BX1" position="float" orientation="portrait"><caption><title>HIGHLIGHTS</title></caption><list list-type="bullet" id="L1"><list-item><p id="P49">We geocode patients from an electronic health record to their corresponding block group.</p></list-item><list-item><p id="P50">We identify sparse environmental 
variables associated with asthma considering spatial variation.</p></list-item><list-item><p id="P51">Sparse principal component analysis and logistic thin plate regression splines were utilized.</p></list-item><list-item><p id="P52">Dogs and rental housing were associated with asthma in specific regions.</p></list-item></list></boxed-text></floats-group></article>