<!DOCTYPE article
PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD with MathML3 v1.2 20190208//EN" "JATS-archivearticle1-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article"><?properties open_access?><front><journal-meta><journal-id journal-id-type="nlm-ta">Malar J</journal-id><journal-id journal-id-type="iso-abbrev">Malar J</journal-id><journal-title-group><journal-title>Malaria Journal</journal-title></journal-title-group><issn pub-type="epub">1475-2875</issn><publisher><publisher-name>BioMed Central</publisher-name><publisher-loc>London</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="pmid">33593329</article-id><article-id pub-id-type="pmc">7885407</article-id><article-id pub-id-type="publisher-id">3624</article-id><article-id pub-id-type="doi">10.1186/s12936-021-03624-2</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research</subject></subj-group></article-categories><title-group><article-title>Development of a new barcode-based, multiplex-PCR, next-generation-sequencing assay and data processing and analytical pipeline for multiplicity of infection detection of <italic>Plasmodium falciparum</italic></article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name><surname>Mitchell</surname><given-names>Rebecca M.</given-names></name><xref ref-type="aff" rid="Aff1">1</xref><xref ref-type="aff" rid="Aff2">2</xref><xref ref-type="aff" rid="Aff3">3</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name><surname>Zhou</surname><given-names>Zhiyong</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib contrib-type="author"><name><surname>Sheth</surname><given-names>Mili</given-names></name><xref ref-type="aff" rid="Aff4">4</xref></contrib><contrib contrib-type="author"><name><surname>Sergent</surname><given-names>Sheila</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib 
contrib-type="author"><name><surname>Frace</surname><given-names>Michael</given-names></name><xref ref-type="aff" rid="Aff4">4</xref></contrib><contrib contrib-type="author"><name><surname>Nayak</surname><given-names>Vishal</given-names></name><xref ref-type="aff" rid="Aff5">5</xref></contrib><contrib contrib-type="author"><name><surname>Hu</surname><given-names>Bin</given-names></name><xref ref-type="aff" rid="Aff5">5</xref></contrib><contrib contrib-type="author"><name><surname>Gimnig</surname><given-names>John</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib contrib-type="author"><name><surname>ter Kuile</surname><given-names>Feiko</given-names></name><xref ref-type="aff" rid="Aff6">6</xref></contrib><contrib contrib-type="author"><name><surname>Lindblade</surname><given-names>Kim</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib contrib-type="author"><name><surname>Slutsker</surname><given-names>Laurence</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib contrib-type="author"><name><surname>Hamel</surname><given-names>Mary J.</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib contrib-type="author"><name><surname>Desai</surname><given-names>Meghna</given-names></name><xref ref-type="aff" rid="Aff1">1</xref></contrib><contrib contrib-type="author"><name><surname>Otieno</surname><given-names>Kephas</given-names></name><xref ref-type="aff" rid="Aff7">7</xref></contrib><contrib contrib-type="author"><name><surname>Kariuki</surname><given-names>Simon</given-names></name><xref ref-type="aff" rid="Aff7">7</xref></contrib><contrib contrib-type="author" corresp="yes"><name><surname>Vigfusson</surname><given-names>Ymir</given-names></name><address><email>ymir.vigfusson@emory.edu</email></address><xref ref-type="aff" rid="Aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name><surname>Shi</surname><given-names>Ya 
Ping</given-names></name><address><email>yps0@cdc.gov</email></address><xref ref-type="aff" rid="Aff1">1</xref></contrib><aff id="Aff1"><label>1</label><institution-wrap><institution-id institution-id-type="GRID">grid.416738.f</institution-id><institution-id institution-id-type="ISNI">0000 0001 2163 0069</institution-id><institution>Division of Parasitic Diseases, Center for Global Health, </institution><institution>Centers for Disease Control and Prevention (CDC), </institution></institution-wrap>Atlanta, USA </aff><aff id="Aff2"><label>2</label><institution-wrap><institution-id institution-id-type="GRID">grid.189967.8</institution-id><institution-id institution-id-type="ISNI">0000 0001 0941 6502</institution-id><institution>Department of Computer Science, </institution><institution>Emory University, </institution></institution-wrap>Atlanta, USA </aff><aff id="Aff3"><label>3</label><institution-wrap><institution-id institution-id-type="GRID">grid.189967.8</institution-id><institution-id institution-id-type="ISNI">0000 0001 0941 6502</institution-id><institution>School of Nursing, </institution><institution>Emory University, </institution></institution-wrap>Atlanta, USA </aff><aff id="Aff4"><label>4</label><institution-wrap><institution-id institution-id-type="GRID">grid.416738.f</institution-id><institution-id institution-id-type="ISNI">0000 0001 2163 0069</institution-id><institution>Biotechnology Core Facility Branch, Division of Scientific Resources, </institution><institution>CDC, </institution></institution-wrap>Atlanta, USA </aff><aff id="Aff5"><label>5</label><institution-wrap><institution-id institution-id-type="GRID">grid.416738.f</institution-id><institution-id institution-id-type="ISNI">0000 0001 2163 0069</institution-id><institution>Office of Infectious Diseases, </institution><institution>National Center for Emerging and Zoonotic Infectious Diseases, CDC, </institution></institution-wrap>Atlanta, USA </aff><aff 
id="Aff6"><label>6</label><institution-wrap><institution-id institution-id-type="GRID">grid.48004.38</institution-id><institution-id institution-id-type="ISNI">0000 0004 1936 9764</institution-id><institution>Liverpool School of Tropical Medicine, </institution></institution-wrap>Liverpool, UK </aff><aff id="Aff7"><label>7</label><institution-wrap><institution-id institution-id-type="GRID">grid.33058.3d</institution-id><institution-id institution-id-type="ISNI">0000 0001 0155 5938</institution-id><institution>Kenya Medical Research Institute, </institution><institution>Centre for Global Health Research, </institution></institution-wrap>Kisumu, Kenya </aff></contrib-group><pub-date pub-type="epub"><day>16</day><month>2</month><year>2021</year></pub-date><pub-date pub-type="pmc-release"><day>16</day><month>2</month><year>2021</year></pub-date><pub-date pub-type="collection"><year>2021</year></pub-date><volume>20</volume><elocation-id>92</elocation-id><history><date date-type="received"><day>10</day><month>6</month><year>2020</year></date><date date-type="accepted"><day>4</day><month>2</month><year>2021</year></date></history><permissions><copyright-statement>&#x000a9; The Author(s) 2021</copyright-statement><license license-type="OpenAccess"><license-p><bold>Open Access</bold>This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made. The images or other third party material in this article are included in the article's Creative Commons licence, unless indicated otherwise in a credit line to the material. 
If material is not included in the article's Creative Commons licence and your intended use is not permitted by statutory regulation or exceeds the permitted use, you will need to obtain permission directly from the copyright holder. To view a copy of this licence, visit <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">http://creativecommons.org/licenses/by/4.0/</ext-link>. The Creative Commons Public Domain Dedication waiver (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/publicdomain/zero/1.0/">http://creativecommons.org/publicdomain/zero/1.0/</ext-link>) applies to the data made available in this article, unless otherwise stated in a credit line to the data.</license-p></license></permissions><abstract id="Abs1"><sec><title>Background</title><p id="Par1">Simultaneous infection with multiple malaria parasite strains is common in high transmission areas. Quantifying the number of strains per host, or the multiplicity of infection (MOI), provides additional parasite indices for assessing transmission levels but it is challenging to measure accurately with current tools. This paper presents new laboratory and analytical methods for estimating the MOI of <italic>Plasmodium falciparum</italic>.</p></sec><sec><title>Methods</title><p id="Par2">Based on 24 single nucleotide polymorphisms (SNPs) previously identified as stable, unlinked targets across 12 of the 14 chromosomes within <italic>P. falciparum</italic> genome, three multiplex PCRs of short target regions and subsequent next generation sequencing (NGS) of the amplicons were developed. A bioinformatics pipeline including B4Screening pathway removed spurious amplicons to ensure consistent frequency calls at each SNP location, compiled amplicons by SNP site diversity, and performed algorithmic haplotype and strain reconstruction. 
The pipeline was validated by 108 samples generated from cultured-laboratory strain mixtures in different proportions and concentrations, with and without pre-amplification, and using whole blood and dried blood spots (DBS). The pipeline was applied to 273 smear-positive samples from surveys conducted in western Kenya, and the results were then provided to StrainRecon Thresholding for Infection Multiplicity (STIM), a novel MOI estimator.</p></sec><sec><title>Results</title><p id="Par3">The 24 barcode SNPs were successfully identified uniformly across the 12 chromosomes of <italic>P. falciparum</italic> in a sample using the pipeline. Pre-amplification and parasite concentration, while non-linearly associated with SNP read depth, did not influence the SNP frequency calls. Based on consistent SNP frequency calls at targeted locations, the algorithmic strain reconstruction for each laboratory-mixed sample had 98.5% accuracy in dominant strains. STIM detected up to 5 strains in field samples from western Kenya and showed declining MOI over time (<italic>q</italic>&#x02009;&#x0003c;&#x02009;0.02), from 4.32 strains per infected person in 1996 to 4.01, 3.56 and 3.35 in 2001, 2007 and 2012, and a reduction in the proportion of samples with 5 strains from 57% in 1996 to 18% in 2012.</p></sec><sec><title>Conclusion</title><p id="Par4">The combined approach of new multiplex PCRs and NGS, the unique bioinformatics pipeline and STIM could identify 24 barcode SNPs of <italic>P. falciparum</italic> correctly and consistently. 
The methodology could be applied to field samples to reliably measure temporal changes in MOI.</p></sec></abstract><kwd-group xml:lang="en"><title>Keywords</title><kwd><italic>Plasmodium falciparum</italic></kwd><kwd>Transmission</kwd><kwd>Multiplicity of infection</kwd><kwd>Haplotype and strain</kwd><kwd>StrainRecon</kwd><kwd>MOI estimation</kwd><kwd>STIM</kwd></kwd-group><funding-group><award-group><funding-source><institution-wrap><institution-id institution-id-type="FundRef">http://dx.doi.org/10.13039/100000030</institution-id><institution>Centers for Disease Control and Prevention</institution></institution-wrap></funding-source><award-id>AMD-132inc</award-id><principal-award-recipient><name><surname>Shi</surname><given-names>Ya Ping</given-names></name></principal-award-recipient></award-group></funding-group><funding-group><award-group><funding-source><institution-wrap><institution-id institution-id-type="FundRef">http://dx.doi.org/10.13039/100000001</institution-id><institution>National Science Foundation</institution></institution-wrap></funding-source><award-id>CAREER grant #1553579</award-id><principal-award-recipient><name><surname>Vigfusson</surname><given-names>Ymir</given-names></name></principal-award-recipient></award-group></funding-group><custom-meta-group><custom-meta><meta-name>issue-copyright-statement</meta-name><meta-value>&#x000a9; The Author(s) 2021</meta-value></custom-meta></custom-meta-group></article-meta></front><body><sec id="Sec1"><title>Background</title><p id="Par20">Malaria infection remains a major public health problem in sub-tropical and tropical areas. <italic>Plasmodium falciparum</italic> is responsible for most malaria-attributed morbidity and mortality [<xref ref-type="bibr" rid="CR1">1</xref>]. Accurately and timely measuring of the change in <italic>P. 
falciparum</italic> transmission levels is not only important in interpretation of data from epidemiological investigations and transmission-reducing intervention studies, but also is essential in the impact evaluation of programmatic activities on malaria transmission. Traditionally, the entomological inoculation rate (EIR) has been used for measuring malaria transmission level mostly for epidemiological studies [<xref ref-type="bibr" rid="CR2">2</xref>]. However, EIR is not suitable for obtaining estimates of transmission level rapidly and its accuracy has been questioned in some studies [<xref ref-type="bibr" rid="CR3">3</xref>&#x02013;<xref ref-type="bibr" rid="CR5">5</xref>]. Epidemiological approaches, such as cohort infection incidence studies, parasite prevalence surveys and passive case incidence data, are frequently used to measure transmission levels, but cohort incidence studies with relatively high precision are expensive and time consuming, and other malaria metrics may be subject to a number of biases [<xref ref-type="bibr" rid="CR3">3</xref>]. Moreover, several malaria metrics could exhibit non-linear scaling relationships [<xref ref-type="bibr" rid="CR3">3</xref>]. In pioneering work, measuring the multiplicity of infection (MOI, defined as the number of concurrent parasite strains per parasite-positive host) using molecular genotyping tools showed that MOI positively correlates with endemicity [<xref ref-type="bibr" rid="CR6">6</xref>&#x02013;<xref ref-type="bibr" rid="CR8">8</xref>] and has been considered and proposed as an adjunct metric for characterizing malaria transmission [<xref ref-type="bibr" rid="CR3">3</xref>, <xref ref-type="bibr" rid="CR9">9</xref>]. MOI can be the result of multiple mosquito bites (superinfection), a single mosquito bite (co-transmission) [<xref ref-type="bibr" rid="CR10">10</xref>, <xref ref-type="bibr" rid="CR11">11</xref>], or both. 
The MOI metric, assuming adequate precision, can uncover parasite strain populations that may enhance understanding of transmission dynamics [<xref ref-type="bibr" rid="CR3">3</xref>].</p><p id="Par21">Previously established molecular assays, including size-based polymorphic antigenic genes or neutral microsatellites, have limited power to measure MOI accurately because they cannot determine the true parasite haplotypes within a host [<xref ref-type="bibr" rid="CR12">12</xref>&#x02013;<xref ref-type="bibr" rid="CR14">14</xref>]. Although antigen-targeted or non-antigen-targeted gene (amplicon) deep sequencing could produce the information on gene-specific MOI and improve the sensitivity of minor variant detection [<xref ref-type="bibr" rid="CR14">14</xref>&#x02013;<xref ref-type="bibr" rid="CR16">16</xref>], a single-target, deep sequencing strategy could not reflect the genomic signatures of parasite strains due to only a small genomic region sequenced. In addition, the gene-specific MOIs generated from multiple-target gene deep sequencing could vary even within a study [<xref ref-type="bibr" rid="CR14">14</xref>, <xref ref-type="bibr" rid="CR16">16</xref>], depending on the different multiple targets chosen, the level of host immune pressure on specific antigen, and/or the different extent of diversity in the multiple targets either antigenic or non-antigenic genes.</p><p id="Par22">Whole genome deep sequencing has improved MOI estimation at the population level by detecting genomic signatures and minority strains of parasites through newly developed analytical tools, StrainRecon [<xref ref-type="bibr" rid="CR17">17</xref>] and DEploid [<xref ref-type="bibr" rid="CR18">18</xref>]. The main difference between StrainRecon and DEploid is that DEploid requires a reference panel of strains to be provided as a prior for potential haplotypes present in the sample, whereas StrainRecon, discussed below, requires no templates or priors. 
Yet whole genome sequencing imposes practical challenges, particularly with respect to the large volume of parasite-infected red blood cell sample that is needed, along with time and cost, which together diminish the feasibility for rapid MOI estimation.</p><p id="Par23">Separately, a molecular barcoding tool for identification and tracking of <italic>P. falciparum</italic> using 24 single nucleotide polymorphism (SNP) markers was developed by Daniels et al. [<xref ref-type="bibr" rid="CR19">19</xref>] as stable, unlinked targets across 12 of the 14 chromosomes within the <italic>P. falciparum</italic> genome. Whereas this 24 SNP individual TaqMan real-time PCR barcode tool is used successfully in pre-elimination or low-endemic areas for detection of a unique fingerprint or signature for a parasite genome [<xref ref-type="bibr" rid="CR20">20</xref>], it fails in environments where individuals have infections with multiple strains since the previously established laboratory and analytical tools cannot classify haplotypes and provide quantitative information on the number of strains within a host [<xref ref-type="bibr" rid="CR19">19</xref>, <xref ref-type="bibr" rid="CR21">21</xref>]. The main objective of this study is therefore to overcome the obstacles in using the identified 24 SNP barcodes for MOI analysis in medium/high transmission areas. Here, the laboratory and algorithmic challenges of performing 24 SNP barcode-based MOI estimation suitable for medium/high transmission areas are addressed. 
To this end, a complete pipeline for estimating MOI from blood was built, including molecular tools and numerical algorithms to determine likely barcodes, haplotype and strain number within an individual sample, resulting in a tool that can be useful across different transmission levels.</p><p id="Par24">Described below is the development and validation of an advanced laboratory assay and unique data processing pipeline with B4Screening pathway for strain disambiguation using three multiplex PCRs followed by MiSeq deep sequencing based on the published panel of 24 SNPs of <italic>P. falciparum</italic> [<xref ref-type="bibr" rid="CR19">19</xref>]. Further validation of MOI estimation was conducted on field samples collected over time from Kenya using the recently published algorithm StrainRecon [<xref ref-type="bibr" rid="CR17">17</xref>] and a novel threshold-calibrated MOI estimation method, StrainRecon Thresholding for Infection Multiplicity (STIM), presented below. The methods developed in this study offer malaria researchers the ability to target multiple genetic loci in sufficient depth to link across sites based on frequency of SNP reads at each site using the non-template approach. The consistency of these SNP frequencies determines the ability of the algorithm to successfully assign frequencies to haplotypes and disentangle samples that comprise multiple strains [<xref ref-type="bibr" rid="CR17">17</xref>].</p></sec><sec id="Sec2"><title>Methods</title><p id="Par25">Since many different analytical methods are involved in the scope of this study, relevant details around each analysis method are provided in the corresponding section where appropriate. In addition, stepwise workflow is described in each technical section.</p><sec id="Sec3"><title>Ethical considerations</title><p id="Par26">The de-identified testing for Illumina deep sequencing at the CDC Malaria laboratory was determined as non-human subjects research by US CDC. 
The study protocols, from which field samples were obtained, were reviewed and approved by the Ethics Review Committee of the Kenya Medical Research Institute including blood sample collection and use of the samples for parasite genotyping.</p></sec><sec id="Sec4"><title>Development of multiplexing PCRs with next generation sequencing assay</title><sec id="Sec5"><title>Parasite strains and quantification</title><p id="Par27">Six laboratory-cultured <italic>P. falciparum</italic> strains (D6, D10, 7G8, RO33, V1/S, W2) representing 24 known barcode SNPs were used for assay development. Extraction of DNA from cultured parasites was performed with QIAamp DNA Mini Kit (QIAGEN, Germantown, MD, USA). DNA was quantified using a real-time PCR protocol described by Daniels et al<italic>.</italic> [<xref ref-type="bibr" rid="CR19">19</xref>] with a series of diluted standard curve (10<sup>2</sup>&#x02013;10<sup>5</sup>) of the plasmid targeting a single copy of <italic>Pf07_0076</italic> gene.</p></sec><sec id="Sec6"><title>Preparation of mixed parasite strains</title><p id="Par28">Unlike the detection limit in traditional PCR diagnosis, the lowest limit for detecting a minor strain in a mixed strain infection is compounded by both lowest proportion of the strain and lowest parasite concentration (density). Citrated O<sup>+</sup> whole blood was used for dilution of above quantified samples. Blood was spiked with 10<sup>5</sup>&#x000a0;parasites/&#x000b5;l with 3 different strain combinations (A: D10/D6/V1-S, B: D6/RO33/W2, C: 7G8/V1-S/RO33). Each combination was prepared to target in 3 different proportions (1: 97.5&#x02013;2&#x02013;0.5%, 2: 95&#x02013;4&#x02013;1%, 3: 88&#x02013;10&#x02013;2%, respectively). These 9 combination-proportion preparations were then diluted in tenfold dilution series to 10<sup>2</sup>&#x000a0;parasites/&#x000b5;l. 
Based on the designed lowest proportion (0.5%) and lowest parasite concentration (10<sup>2</sup>&#x000a0;parasites/&#x000b5;l) described above, a target lowest limit for minor strain detection was expected at 0.5&#x000a0;parasite/&#x000b5;l. Each dilution stage was preserved as dried blood spots (50&#x000a0;&#x000b5;l per spot) on Whatman 903 filter paper (DBS) (GE Healthcare, Westborough, MA, USA) and 50-&#x000b5;l liquid aliquots in 1.5&#x000a0;ml sample tubes. The liquid aliquots were immediately frozen at &#x02212;&#x02009;80&#x000a0;&#x000b0;C. The filter paper spots were dried overnight in a biosafety hood and were then stored at &#x02212;&#x02009;80&#x000a0;&#x000b0;C in a Ziploc bag with 4 desiccant packs (IMPAKCorp., Los Angeles, CA, USA) and a moisture indicator. A single DBS (50&#x000a0;&#x000b5;l) and equally a 50&#x000a0;&#x000b5;l of liquid aliquot were used for each DNA isolation using QIAamp DNA Mini Kits (QIAGEN).</p></sec><sec id="Sec7"><title>Multiplex PCR development</title><p id="Par29">The primers for 24 SNPs by individual real-time PCRs described by Daniels et al<italic>.</italic> [<xref ref-type="bibr" rid="CR19">19</xref>] were used for 3 multiplex PCR reactions with the addition of standard 16S sequencing overhang adaptor sequences [Forward overhang: 5&#x02032; TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG&#x02010;(locus-specific primer) and Reverse overhang: 5&#x02032; GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG&#x02010;(locus-specific primer)] [<xref ref-type="bibr" rid="CR22">22</xref>] for use in MiSeq library preparation. Multiplex combinations were designed via PrimerSelect (DNAStar Lasergene, Madison, WI, USA) to minimize primer-dimers (artifacts) in each multiplex reaction. Multiplex combinations were 7-way (SNPs: 2, 3, 6, 8, 10, 15, 23); 8-way (SNPs: 4, 5, 7, 11, 12, 14, 16, 21) and 9-way (SNPs: 1, 9, 13, 17, 18, 19, 20, 22, 24). 
For each multiplex reaction, a stock primer solution was made of all forward and reverse primers with a concentration of 100&#x000a0;&#x000b5;M each.</p><p id="Par30">Multiplex PCR reaction contained 12.5&#x000a0;&#x000b5;l 2&#x000d7; master mix Platinum Multiplex PCR supermix (ThermoFisher Scientific, Waltham, MA, USA), 100&#x000a0;nM each primer, 1&#x000a0;&#x000b5;l DNA template and PCR water in a 25-&#x000b5;l total reaction volume.&#x000a0;Samples were run with 2&#x000a0;min of initial denaturation at 95&#x000a0;&#x000b0;C, followed by 35 cycles of 30&#x000a0;s denaturation at 95&#x000a0;&#x000b0;C, 1&#x000a0;min and 30&#x000a0;s annealing at 60&#x000a0;&#x000b0;C and 30&#x000a0;s extension at 72&#x000a0;&#x000b0;C, with a final extension at 72&#x000a0;&#x000b0;C for 10&#x000a0;min and final resting temperature of 4&#x000a0;&#x000b0;C.</p><p id="Par31">Following visualization via DNA electrophoresis on a 2% agarose gel with 3&#x000a0;&#x000b5;l of 50&#x000a0;bp Track-it ladder (ThermoFisher Scientific) to ensure amplification, the short PCR products of 3 multiplex PCRs were pooled together for a sample and purified collectively on QIAquick PCR purification columns (QIAGEN). The purified DNA was quantified using a Nano Drop 2000 instrument.</p></sec><sec id="Sec8"><title>Testing with pre-amplification (two-step PCRs)</title><p id="Par32">To evaluate the ability to amplify low frequency SNPs, a comparison test between one-step multiplex PCR described above and a pre-amplification PCR step before multiplex PCR with PreAmp PCR kits (ThermoFisher Scientific) was conducted. The pre-amplification step used the same primers as each individual multiplex at 100&#x000a0;nM each, with 10&#x000a0;&#x000b5;l of TaqMan Preamp Master Mix solution, 1&#x000a0;&#x000b5;l DNA template and PCR water up to 20&#x000a0;&#x000b5;l total reaction volume. 
Following pre-amplification, products were diluted 1:20 and 1&#x000a0;&#x000b5;l diluents were carried over into the multiplex PCRs described above.</p></sec><sec id="Sec9"><title>Testing with different concentration in laboratory-cultured strains</title><p id="Par33">To evaluate the minimum detection threshold of multiplex PCRs and the NGS (see below for detailed NGS), serial dilutions of <italic>P. falciparum</italic> at 10<sup>2</sup> to 10<sup>5</sup>/&#x000b5;l from the different combinations and proportions of parasite strains were tested in triplicate. All samples with parasite concentration&#x02009;&#x0003c;&#x02009;10<sup>5</sup>/&#x000b5;l were run both with and without a pre-amplification.</p></sec><sec id="Sec10"><title>Testing with different type of samples using laboratory-cultured strains</title><p id="Par34">Both frozen liquid blood samples (50&#x000a0;&#x000b5;l per sample) and DBS samples (50&#x000a0;&#x000b5;l per spot) were tested with and without pre-amplification. In total, final 108 samples generated from laboratory strains (with different combinations, proportions, concentrations, and sample types) were tested.</p></sec><sec id="Sec11"><title>Testing with field samples</title><p id="Par35">Two-hundred and seventy-three smear-positive samples from 4 cross-sectional surveys in Asembo, western Kenya were used for this study [<xref ref-type="bibr" rid="CR23">23</xref>&#x02013;<xref ref-type="bibr" rid="CR25">25</xref>]. Among these, 65 samples from 1996 and 72 samples from 2001 were randomly selected in children from 6&#x000a0;months to 5&#x000a0;years, and 53 samples from 2007 and 83 samples from 2012 were used based on the availability of samples in individuals from 6&#x000a0;months to 20&#x000a0;years. 
Since age is known not to affect parasite diversity or MOI measurements when neutral markers, such as microsatellites and 24 SNP barcode, are used [<xref ref-type="bibr" rid="CR20">20</xref>, <xref ref-type="bibr" rid="CR23">23</xref>, <xref ref-type="bibr" rid="CR26">26</xref>], it was expected that the 24 SNP barcode-based MOI measured in this study is also unlikely to be influenced by age [<xref ref-type="bibr" rid="CR20">20</xref>, <xref ref-type="bibr" rid="CR26">26</xref>]. The field samples were tested in the same multiplex PCR and NGS conditions as the laboratory-cultured mixed parasite strains with or without the pre-amplifications. Laboratory-cultured strains as positive controls and normal blood as negative control were used for each experimental run for the field samples.</p></sec><sec id="Sec12"><title>MiSeq library preparation and run</title><p id="Par36">MiSeq Libraries were prepared using a standard 16S Metagenomics sequencing protocol (Illumina) [<xref ref-type="bibr" rid="CR22">22</xref>]. Briefly, the PCRs for attaching sequencing indexes and adapters were performed with the Nextera XT Index Primers with dual index barcodes followed by a cleaning process. The final libraries were analysed on Fragment Analyzer for size and concentration (ng/&#x000b5;l). The libraries for all the samples were normalized to 10&#x000a0;nM by diluting in elution buffer and pooled at equal volume. The final 10&#x000a0;nM pool was diluted to 2&#x000a0;nM and denatured and diluted for loading on the Miseq flowcell. Up to 54 barcoded samples were pooled in one library (per plate) using Illumina 2&#x02009;&#x000d7;&#x02009;250&#x000a0;bp run. On completion, sequence reads were filtered for read quality, base called and demultiplexed using bcl2fastq (v2.19). 
The sequencing results were saved for bioinformatics analysis.</p></sec></sec><sec id="Sec13"><title>Development of bioinformatics analysis pipeline for processing NGS data</title><p id="Par37">The bioinformatics analysis pipeline developed in this study was designed to clean NGS data for consistent frequency calls at each SNP site across 12 chromosomes. Because 24 SNP barcodes differ from 16S metagenomics data in depth of coverage and amplicon diversity, an existing 16S bioinformatics pipeline was modified, and several additional data cleaning components were integrated as described below. This novel pathway, called B4Screening, involves four steps: initial adaptor trimming and cleaning, Bioconductor dada2 pathway, targeted removal of mismatched primers or probes, and a random forest classification of remaining reads. All steps are detailed below.</p><sec id="Sec14"><title>Data processing</title><p id="Par38">Sequences were first trimmed to eliminate Illumina adaptor primers by CutAdapt 1.8 and cleaned by Prinseq 0.20.3. Quality was examined via FaQCs 1.34. Following CutAdapt and Prinseq, sequences were loaded into R to use a modified Bioconductor 16S pipeline. Dada2 (v1.6.0) was used for secondary filtering and trimming, error rate detection, pairing, and chimera removal. Following processing in the dada2 pipeline to remove bimeras, a second chimera removal step was performed which required that forward and reverse primers matched the same target location. The first and last 15 base pairs of each read were compared to the lookup table of potential primers, and the best matching primer (maximum of 2 mismatches) was identified as the input primer. Reads were reverse complemented and those reads without a matching forward and reverse primer pair were removed. Matching to initial probe location from the publication [<xref ref-type="bibr" rid="CR19">19</xref>] was used to evaluate whether reads matched the reference or alternative base at the SNP site. 
If neither reference nor alternative SNP (as designated binary) matched, the reads were removed. For each amplicon, reference and alternative probe sequences were trimmed to the identical length, and number of mismatches was calculated for each probe. Amplicons that had more than two mismatches to both probes were discarded as a spurious amplicon. Because the database included SNPs only, amplicon length variations greater than one base pair <italic>versus</italic> expected reference length were removed. The low frequency variants (&#x0003c;&#x02009;0.1% of the total sequences) were also removed.</p></sec><sec id="Sec15"><title>Matching to reference sequences</title><p id="Par39">The Bioconductor pipeline was designed for 16S data, therefore, the assignment of genus and species was used as proxy for amplicons matching the correct targets in the genome. Each region was entered as part of a reference library. Genus level assignment was provided via assignTaxonomy with a minimum bootstrap value of 80%. Reference amplicon sequences from the <italic>P. falciparum</italic> 3D7 were identified as exact matches for &#x0201c;species&#x0201d; assignment in dada2 assignSpecies. A phyloseq object was produced (Phyloseq version 1.22.3) incorporating SNP site and other relevant sample information and unique amplicons for all possible variants in the dataset prior to removal of low frequency amplicons.</p></sec><sec id="Sec16"><title>Designing a random forest (RF) machine learning classifier</title><p id="Par40">To address the amplicons generated by multiplex processing in data in which sample composition is not known, a classifier was trained on 3/4 portion of the data and tested on the remaining quarter. This classifier was generated in caret (v 6.0.84 in R) using a random forest (RF) design. 
The classifier was provided with 4 elements: (1) proportion of all unique amplicons by sample; (2) proportion of all unique amplicons by plate; (3) reads with SNP by plate; and, (4) sample reads per plate, all of which should be reflective of unique amplicon distribution in field data. Repeated tenfold cross validation (10, repeat 3) and features were evaluated in both the laboratory strain dataset and datasets in which sample characteristics are unknown. As a test, the classifier was also trained on two combinations and evaluated on a third for comparison of unrelated data.</p><p id="Par41">Initial amplicon training labels were generated based on mixes of laboratory strains with known SNP locations. Training labels of &#x0201c;positive&#x0201d; and &#x0201c;negative&#x0201d; were based on three criteria. For a positive label: (1) strains had to be present in more than 12 samples of the 36 samples representing each combination; (2) strains had to be more than 0.1% of the data at that SNP location; and, (3) strains had to change frequency with changing proportions of strains in models accounting for expected frequency ratio with MiSeq run as a random effect (using lmer in R). Failing to meet any of the above criteria resulted in a negative label.</p></sec><sec id="Sec17"><title>Read depth and frequency determination</title><p id="Par42">Following removal of all spurious amplicons, only samples with at least 16 SNP sites represented with a read depth of at least 500 for each SNP were retained for frequency estimation. Read frequency for reference (REF) versus alternative (ALT) alleles was calculated by aggregating all confirmed amplicon sequence variants within a genus designation by target SNP. To determine the influence of preprocessing and parasite density on reads per locus, log<sub>10</sub> reads per locus were modelled using random effects mixed models in R (lmer). 
Formula: ReadsPerLocus&#x02009;=&#x02009;ParasiteConcentration&#x02009;+&#x02009;PreAmplification&#x02009;+&#x02009;Proportions&#x02009;+&#x02009;Mixture&#x02009;+&#x02009;Plate, with a random intercept: Sample.</p></sec><sec id="Sec18"><title>Consistency of read frequencies</title><p id="Par43">To get initial point estimates for SNP read frequencies in each experimental combination-mixture pairing, restricted regression (R package restriktor) was performed per unique experimental combination. Frequency calls within each sample were analysed for impact of pre-amplification or parasite concentration while controlling for unique biologic mixture (combination, proportion, sample type) using a linear model.</p></sec></sec><sec id="Sec19"><title>Strain reconstruction and haplotype analysis</title><p id="Par44">A previously developed mathematical algorithm, StrainRecon [<xref ref-type="bibr" rid="CR17">17</xref>], was used for identifying haplotypes and enumerating strains based on frequency results of individual 24-SNP barcodes generated from NGS. Briefly, the basic StrainRecon algorithm accepts a vector <inline-formula id="IEq1"><alternatives><tex-math id="M1">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$${\varvec{s}}$$\end{document}</tex-math><mml:math id="M2"><mml:mrow><mml:mi mathvariant="bold-italic">s</mml:mi></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq1.gif"/></alternatives></inline-formula> of SNP frequencies as input and a parameter <bold><italic>n</italic></bold> denoting the anticipated number of strains. The output is twofold: a binary matrix <bold><italic>M</italic></bold> containing <bold><italic>n</italic></bold> reconstructed strain barcodes, and a vector <bold><italic>v</italic></bold> of strain mixtures. The strains found by the algorithm represent the maximum likelihood estimate of the original barcodes under three assumptions. First, the overall error from all steps of the pipeline can be modelled as normally distributed (Gaussian) noise. Second, noise values are independent between SNP sites. Third, the metric minimized to optimize the solution quality is <inline-formula id="IEq2"><alternatives><tex-math id="M3">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$\left\| {Mv - s} \right\|_{2}$$\end{document}</tex-math><mml:math id="M4"><mml:msub><mml:mfenced close="&#x02225;" open="&#x02225;"><mml:mrow><mml:mi>M</mml:mi><mml:mi>v</mml:mi><mml:mo>-</mml:mo><mml:mi>s</mml:mi></mml:mrow></mml:mfenced><mml:mn>2</mml:mn></mml:msub></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq2.gif"/></alternatives></inline-formula>, called the <italic>misfit</italic>. (Here, <inline-formula id="IEq3"><alternatives><tex-math id="M5">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$\left\| \cdot \right\|_{2}$$\end{document}</tex-math><mml:math id="M6"><mml:msub><mml:mfenced close="&#x02225;" open="&#x02225;"><mml:mo>&#x000b7;</mml:mo></mml:mfenced><mml:mn>2</mml:mn></mml:msub></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq3.gif"/></alternatives></inline-formula> denotes the standard Euclidean L2-norm.) Both <bold><italic>M</italic></bold> and <bold><italic>v</italic></bold> are assumed to be completely unknown, making this a mathematical inverse problem. StrainRecon models the problem as a Bayesian maximum-a-posteriori (MAP) estimation problem, using block coordinate descent to quickly converge at a solution when the number of strains <italic>n</italic> is small. The algorithm further provides an average (and standard deviation) over the entire posterior distribution of candidate solutions, which can be used to illustrate confidence or ambiguity of each call the algorithm makes within a barcode or strain mixture fraction. In this paper, the StrainPycon 1.0 [<xref ref-type="bibr" rid="CR27">27</xref>] implementation of the StrainRecon algorithm for Python 3.4.3 was used for analysis. Each sample was evaluated for barcode sequences given the assumption that field samples can include up to and including 6 strains.</p></sec><sec id="Sec20"><title>StrainRecon thresholding for infection multiplicity (STIM) algorithm</title><p id="Par45">StrainRecon allows discrimination of the pattern of haplotypes that are sufficiently prevalent in a sample relative to the experimental noise, as measured by misfit. The goal of this study was to create a tool that can further estimate the true number of parasite strain infections in a sample without the need for templates while being aware that extremely low-proportion strains in the sample are indistinguishable from noise. 
The method, therefore, takes each sample and runs the StrainRecon algorithm in a loop over number of strains <italic>n</italic> to determine the misfit of the MAP solution for each <italic>n</italic>. As the number of free parameters increases with <italic>n</italic>, the largest misfit will be found with <italic>n</italic>&#x02009;=&#x02009;1, and the misfit decreases with larger <italic>n</italic>. In lay terms, it will always be possible to better fit the strains of a sample by imagining that it contained more haplotypes, but some of these haplotypes may be an artifact of introduced variation inherent in NGS sequencing. The approach used in this study for determining MOI was to set a threshold value, called <italic>T</italic>, on the misfit value and to return the lowest <italic>n</italic> whose MAP estimate had misfit below <italic>T</italic>. Together, the complete procedure is referred to as the STIM algorithm.</p><p id="Par46">The upper bound of <italic>n</italic> is limited by noise levels in the analytical pipeline. In practice, when error rates are in the range of 1&#x02013;3%, the upper bound of <italic>n</italic> that can be meaningfully processed is about 5&#x02013;6 strains. Specifically, even under optimistic assumptions about mixture proportions for strain disambiguation (which is when mixture proportions are proportional to the first powers of 2), an inverse problem barring an informative Bayesian prior or a tailored noise model will be unable to differentiate more than <inline-formula id="IEq4"><alternatives><tex-math id="M7">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n = - \log_{2} \left( \varepsilon \right)$$\end{document}</tex-math><mml:math id="M8"><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:msub><mml:mo>log</mml:mo><mml:mn>2</mml:mn></mml:msub><mml:mfenced close=")" open="("><mml:mi>&#x003b5;</mml:mi></mml:mfenced></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq4.gif"/></alternatives></inline-formula> strains from noise level of <inline-formula id="IEq5"><alternatives><tex-math id="M9">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$\varepsilon &#x0003e; 0$$\end{document}</tex-math><mml:math id="M10"><mml:mrow><mml:mi>&#x003b5;</mml:mi><mml:mo>&#x0003e;</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq5.gif"/></alternatives></inline-formula>. For example, at 1% error, this bound is <inline-formula id="IEq6"><alternatives><tex-math id="M11">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n \le - \log_{2} \left( {0.01} \right) \approx 6.6$$\end{document}</tex-math><mml:math id="M12"><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x02264;</mml:mo><mml:mo>-</mml:mo><mml:msub><mml:mo>log</mml:mo><mml:mn>2</mml:mn></mml:msub><mml:mfenced close=")" open="("><mml:mrow><mml:mn>0.01</mml:mn></mml:mrow></mml:mfenced><mml:mo>&#x02248;</mml:mo><mml:mn>6.6</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq6.gif"/></alternatives></inline-formula>. Within these constraints, the STIM method has the crucial advantage of estimating MOI while not depending on any template information, unlike DEploid [<xref ref-type="bibr" rid="CR18">18</xref>], for example.</p></sec></sec><sec id="Sec21"><title>Results</title><sec id="Sec22"><title>Sequencing coverage in laboratory strain samples</title><p id="Par47">Of the 23.48&#x000a0;million Prinseq-cleaned, paired reads in the 108 laboratory-generated samples, 21.47&#x000a0;million reads remained prior to evaluating with the RF classifier (see next subsection for development of RF classifier). The number of unique amplicons assigned (across all 24 loci) in the laboratory strain dataset dropped from 9901 following the standard dada2 pipeline to 360 following additional filtering for primer mismatch and probe mismatch. Potential unique amplicon numbers per SNP target location in this screened set were varied, with a minimum of 2 (SNP14, SNP18) and a maximum of 70 (SNP11).</p><p id="Par48">Of these 360 unique amplicons, 51 were retained following RF classification, and represented the expected SNPs at the target locations (Fig.&#x000a0;<xref rid="Fig1" ref-type="fig">1</xref>) and several additional SNPs at different locations in the amplicon. The final number of unique amplicons per SNP target location ranged from 1 to 3 in the laboratory strain data. 
Overall, 21.35&#x000a0;million (90.9%) reads remained following exclusion of spurious amplicons via the RF classifier (see Additional file <xref rid="MOESM1" ref-type="media">1</xref>: Fig. S1). For each location, unique amplicons retained after RF classification were summed by target SNPs to generate frequencies for StrainRecon.<fig id="Fig1"><label>Fig. 1</label><caption><p>Distribution of unique amplicons in laboratory strains. Red bar represents number of unique amplicons following dada2 Bioconductor processing steps of B4Screening and additional matching-based steps, and green bar shows number of unique amplicons following RF classifier B4Screening steps, separated by SNP target location</p></caption><graphic xlink:href="12936_2021_3624_Fig1_HTML" id="MO1"/></fig></p></sec><sec id="Sec23"><title>RF machine learning classifier development using laboratory strains</title><p id="Par49">There were 5193 unique datapoints for classifier development in the laboratory strain dataset, accounting for 2576 of the 2592 potential SNP sites among all 108 samples (not all SNPs had&#x02009;&#x0003e;&#x02009;500 reads in each sample). If the classifier was trained with 75% of the total data selected, 3881 data values were used for training and 1312 for testing.</p><p id="Par50">In the 1312 test set, all 1161 positively coded unique amplicons were correctly identified and all 151 negative unique amplicons as initially coded were also correctly identified. Alternatively, if a single mixed combination C was withheld as a test set rather than a random selection of unique amplicons across all data, to protect against inadvertent data leakage, 3369 data-samples combinations were used for training and 1824 for testing. In this case 1587 out of 1624 positively coded unique amplicons were correctly classified (sensitivity: 97.7%) while 197 of 200 (specificity: 98.5%) negatively coded unique amplicons were correctly classified. 
Overall, the amplicons classified for removal were low copy number relative to the total representation of the target region (see Additional file <xref rid="MOESM1" ref-type="media">1</xref>: Fig. S1).</p><p id="Par51">When the field samples from western Kenya were processed, the same RF classifier was used to screen viable amplicons.</p></sec><sec id="Sec24"><title>Read depth and coverage of target SNP sites in laboratory strains</title><p id="Par52">The proportion of samples retaining sufficient read depth across all SNP locations following RF classification was evaluated. A cut-off of 500 reads was selected to ensure that precision would not be influenced by read depth. With a minimum coverage depth of 500 reads post-processing, SNP16 was absent in 16 of the 108 laboratory samples (15%). No laboratory strain samples had fewer than 16 total SNPs represented, and while the fifth percentile number of SNPs present within a sample was 21, 78 of 108 samples had all 24 SNPs present. With a cut-off of 500 reads, individual SNP read depth was lowest for SNP18 (2090 median read depth), and highest for SNP10 (17,630 median read depth) (see Additional file <xref rid="MOESM2" ref-type="media">2</xref>: Fig. S2).</p></sec><sec id="Sec25"><title>Relationship between read depth and sample characteristics</title><p id="Par53">Multivariable models assessed the relationship between sample characteristics and read depth. Neither proportion of strains in a mixture (p&#x02009;=&#x02009;0.43), nor specific strain mixture (p&#x02009;=&#x02009;0.67) was associated with read depth (Fig.&#x000a0;<xref rid="Fig2" ref-type="fig">2</xref>a). Figure&#x000a0;<xref rid="Fig2" ref-type="fig">2</xref>a also showed a low variability among three different processing pipelines. 
Parasite concentration was non-linearly associated with average read depth [(10<sup>2</sup>&#x02009;=&#x02009;5500, 10<sup>3</sup>&#x02009;=&#x02009;6170, 10<sup>4</sup>&#x02009;=&#x02009;8310, 10<sup>5</sup>&#x02009;=&#x02009;8310 reads/site), p&#x02009;&#x0003c;&#x02009;0.0001]. Samples with 10<sup>2</sup> and 10<sup>3</sup> parasite concentration did not differ from each other, nor did samples with 10<sup>4</sup> versus 10<sup>5</sup> parasite concentrations. However, comparisons among other pairs were significant (emmeans, Tukey correction, p&#x02009;&#x0003c;&#x02009;0.01) (Fig.&#x000a0;<xref rid="Fig2" ref-type="fig">2</xref>b). Read depth did not significantly differ between pre-amplified and non-pre-amplified samples [(6460 vs. 7410 average reads/site), p&#x02009;=&#x02009;0.053] (Fig.&#x000a0;<xref rid="Fig2" ref-type="fig">2</xref>b). Both DBS and frozen blood plates yielded sufficient reads in each SNP location.<fig id="Fig2"><label>Fig. 2</label><caption><p>Relationship between read depth and sample characteristics in laboratory strain mixes. <bold>a</bold> Reflects log<sub>10</sub> total reads by plates that contain 3 different strain mixtures A: D10/D6/V1-S, B: D6/RO33/W2, C: 7G8/V1-S/RO33 respectively, as cited in <xref rid="Sec2" ref-type="sec">Methods</xref>. <bold>b</bold> Reflects impact of pre-amplification (PA) and non-pre-amplification (NOPA), separated by parasite concentration as not all parasite concentration/pre-amplification combinations exist in the data. The groups that are significantly different in read depth by original parasite concentration are indicated by &#x0201c;g1&#x0201d; and &#x0201c;g2&#x0201d; symbols, respectively</p></caption><graphic xlink:href="12936_2021_3624_Fig2_HTML" id="MO2"/></fig></p></sec><sec id="Sec26"><title>Consistency of read frequencies in laboratory strain samples</title><p id="Par54">SNP frequency calls for unique strains must be consistent within a single sample to ensure consistent barcode calls. 
Boxplots for frequencies across all parasite concentrations, pre-amplification and sample types (Fig.&#x000a0;<xref rid="Fig3" ref-type="fig">3</xref>) show the full range and central tendency for unique-strain SNP frequencies across all conditions in laboratory strain data. A single site (SNP23) in Combination A mix was excluded from this analysis as the high proportion template appears to have both potential SNPs present in the original source material (see Additional file <xref rid="MOESM3" ref-type="media">3</xref>: Fig. S3).<fig id="Fig3"><label>Fig. 3</label><caption><p>Frequencies of unique SNPs across all parasite concentrations, pre-amplification and sample types from laboratory strain mixes. Red is values with no unique strains at given SNP location, green is where the highest proportion strain is unique, teal where the intermediate proportion strain is unique and purple where the lowest proportion strain is unique. The estimated SNP frequency for each strain as generated in restriktor is represented by a black dot. Combinations (A, B, C) were listed with proportions (1, 2, 3) as cited in <xref rid="Sec2" ref-type="sec">Methods</xref></p></caption><graphic xlink:href="12936_2021_3624_Fig3_HTML" id="MO3"/></fig></p><p id="Par55">Within a sample, a restricted regression linear model for proportion of reads matching the 3D7 reference sequence was fit with all three strains and categories (low, medium, high proportion), constrained to 1 (lm, and restriktor packages in R) to assess consistency across samples prior to running strain reconstruction. 
Unique strain frequencies were not influenced by pre-amplification (p&#x02009;=&#x02009;1), or parasite concentration (p&#x02009;=&#x02009;1), but differed by combination and strain proportion category (low/intermediate/high).</p></sec><sec id="Sec27"><title>Reconstruction of strain barcodes using laboratory strain mixes</title><p id="Par56">Using SNP frequency calls, StrainRecon&#x02019;s MAP estimates of 108 laboratory strain samples were calculated, varying the parameter <italic>n</italic> of the number of strains to be reconstructed (anticipated MOI) between 1 and 5. The reconstructed barcodes for each sample were compared to ground truth barcodes intended to be mixed, counting the number of reference/alternate matches at SNP sites with read depth of at least 500.</p><p id="Par57">The 72 samples from Combinations B and C contain three strains. The dominant strain in the 36 samples from Combination A consistently generated two distinct reads at SNP23 (see Additional file <xref rid="MOESM3" ref-type="media">3</xref>: Fig. S3), indicating a likely presence of mutation. These two distinct reads at SNP23 were reconstructed independently, and Combination A samples were therefore characterized as containing four strains (33% of samples).</p><p id="Par58">Table <xref rid="Tab1" ref-type="table">1</xref> shows that over 98.5% of the base pairs in the dominant &#x0201c;high&#x0201d; strains (target proportion range of 88&#x02013;97.5%) were identified correctly, even when the algorithm is configured to identify more strains (<inline-formula id="IEq7"><alternatives><tex-math id="M13">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n \le 5$$\end{document}</tex-math><mml:math id="M14"><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x02264;</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq7.gif"/></alternatives></inline-formula>) than are truly mixed in the samples [<xref ref-type="bibr" rid="CR17">17</xref>]. The &#x0201c;intermediate&#x0201d; strains (target proportion range of 2&#x02013;10%) were reconstructed at 77.5&#x02013;89.3% accuracy, and the &#x0201c;low&#x0201d; strains in the mixtures (target proportion range of 0.5&#x02013;2%) were reconstructed at 47.1&#x02013;64.6% accuracy. Among these, when using parameter n&#x02009;=&#x02009;3 (true mixtures in samples), the accuracy for &#x0201c;high&#x0201d; strains, &#x0201c;intermediate&#x0201d; strains, and &#x0201c;low&#x0201d; strains were 98.7, 89.3 and 64.6%, respectively. Total DNA template material affected the ability to reconstruct barcodes, with percentage of correct SNP calls in the three true strains at 75.0% among all mixtures at 10<sup>2</sup>&#x000a0;parasites/&#x000b5;l, 80.6% at 10<sup>3</sup>&#x000a0;parasites/&#x000b5;l, 88.4% at 10<sup>4</sup>&#x000a0;parasites/&#x000b5;l, and 90.6% at 10<sup>5</sup>&#x000a0;parasites/&#x000b5;l.<table-wrap id="Tab1"><label>Table 1</label><caption><p>Barcode reconstruction quality with StrainRecon, varying the number of strains to be found</p></caption><table frame="hsides" rules="groups"><thead><tr><th align="left">Proportion of SNP sites correctly reconstructed</th><th align="left">Strain 1 (%)</th><th align="left">Strain 2 (%)</th><th align="left">Strain 3 (%)</th><th align="left">Strain 4<sup>a</sup> (%)</th></tr></thead><tbody><tr><td align="left">Anticipated MOI n&#x02009;=&#x02009;1</td><td char="." align="char">98.97</td><td char="." align="char"/><td char="." align="char"/><td char="." align="char"/></tr><tr><td align="left">Anticipated MOI n&#x02009;=&#x02009;2</td><td char="." 
align="char">98.97</td><td char="." align="char">84.79</td><td char="." align="char"/><td char="." align="char"/></tr><tr><td align="left">Anticipated MOI n&#x02009;=&#x02009;3</td><td char="." align="char">98.70</td><td char="." align="char">89.25</td><td char="." align="char">64.64</td><td char="." align="char"/></tr><tr><td align="left">Anticipated MOI n&#x02009;=&#x02009;4</td><td char="." align="char">98.78</td><td char="." align="char">82.69</td><td char="." align="char">48.80</td><td char="." align="char">19.79</td></tr><tr><td align="left">Anticipated MOI n&#x02009;=&#x02009;5</td><td char="." align="char">98.50</td><td char="." align="char">77.52</td><td char="." align="char">47.06</td><td char="." align="char">19.28</td></tr></tbody></table><table-wrap-foot><p>Columns are arranged by proportion of strain in the mixture, so the most dominant strain is called Strain 1. Non-barcode SNP sites are omitted</p><p><sup>a</sup>Strain 4 exists only in those samples where SNP23 had a mutation</p></table-wrap-foot></table-wrap></p><p id="Par59">It is noted that larger <italic>n</italic> provides more free parameters, thus allowing solutions to have a better fit to the SNP frequency vector than smaller <italic>n</italic>, even when <italic>n</italic> exceeds the true number of strains in a sample. In this latter case, the algorithm &#x0201c;overfits&#x0201d; and attempts to explain experimental noise in the data with extra low-frequency strains to score a lower misfit value (see Additional file <xref rid="MOESM4" ref-type="media">4</xref>: Fig. S4 and Additional file <xref rid="MOESM5" ref-type="media">5</xref>: Fig. S5 with detailed information using individual sample examples). 
However, the third strain (and mutated fourth strain for Combination A samples) in all mixtures had very small proportion, even less than the estimated noise level of the experiment (&#x02264;&#x02009;5%), which deteriorates the barcode reconstruction.</p></sec><sec id="Sec28"><title>StrainRecon thresholding for infection multiplicity (STIM) algorithm</title><p id="Par60">Recall that the StrainRecon method takes the number of anticipated strains <italic>n</italic> as input and then determines what barcodes and mixture vectors of <italic>n</italic> strains produce the best fit (lowest misfit) by minimizing noise. A natural approach for estimating the true number of strains is then to place a threshold on the acceptable fitness level and determine how many strains <italic>n</italic> StrainRecon needs to produce barcodes and mixture vector with sufficiently low misfit. This algorithm is called STIM. The STIM estimation method was first evaluated on synthetic data produced by generating 1000 barcode matrices <bold><italic>M</italic></bold> and mixture vectors <bold><italic>v</italic></bold> uniformly at random for each specific number of strains <italic>n</italic> and calculating the dot-product <bold><italic>Mv</italic></bold>. Following the StrainRecon framework [<xref ref-type="bibr" rid="CR17">17</xref>], Gaussian noise with a known standard deviation <inline-formula id="IEq8"><alternatives><tex-math id="M15">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$${\upgamma }$$\end{document}</tex-math><mml:math id="M16"><mml:mi mathvariant="normal">&#x003b3;</mml:mi></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq8.gif"/></alternatives></inline-formula>, corresponding to the amalgam of errors and reads in the laboratory pipeline, is added to the dot-product before the resulting vector is reconstructed. Figure&#x000a0;<xref rid="Fig4" ref-type="fig">4</xref> (left) varies the threshold value <italic>T</italic> on the horizontal axis, showing that lower noise levels give rise to misfit thresholds where MOI can be discerned for a wider range of strain counts. For comparison, Fig.&#x000a0;<xref rid="Fig4" ref-type="fig">4</xref> (right) shows how well STIM performs when, mathematically, the proportion of each strain in the mixture (vector <bold><italic>v</italic></bold>) is ideally separated from others (by being proportional to powers of 2 as can be shown through mathematical analysis). The noise on the mixture values is normally distributed around a mean of 0 with standard deviation <inline-formula id="IEq9"><alternatives><tex-math id="M17">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$${\upgamma }$$\end{document}</tex-math><mml:math id="M18"><mml:mi mathvariant="normal">&#x003b3;</mml:mi></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq9.gif"/></alternatives></inline-formula> of either 0.05 (top row) or 0.01 (bottom row). Since normally distributed variables fall within two standard deviations from the mean 95% of the time, these <inline-formula id="IEq10"><alternatives><tex-math id="M19">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$${\upgamma }$$\end{document}</tex-math><mml:math id="M20"><mml:mi mathvariant="normal">&#x003b3;</mml:mi></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq10.gif"/></alternatives></inline-formula> values correspond to hypothetical pipelines whose accuracy for SNP reads are expected to lie within 10 and 2%, respectively, of their true values at least 95% of the time. The black vertical bar shows the threshold value suggested by Morozov&#x02019;s discrepancy principle, a point below which data are effectively best explained by noise. The results show that STIM is sensitive to noise levels: lower underlying error rate (standard deviation of 0.01) allows differentiation of strain count between 1 and 5 at some thresholds, whereas high underlying noise (standard deviation of 0.05) is more restrictive.<fig id="Fig4"><label>Fig. 4</label><caption><p>STIM estimation method on synthetic data. The data was generated by creating uniformly random barcodes and mixture vectors and adding Gaussian noise (top: std.dev&#x02009;=&#x02009;0.05, bottom: std.dev&#x02009;=&#x02009;0.01) to create input vectors for StrainRecon. The graph shows the percentage of synthetic samples for which the MOI was correctly identified by choosing the lowest number of strains that go below the misfit threshold on the horizontal axis. The plots on the left show regular performance, whereas on the right StrainRecon is provided with the correct value of the mixture vector. The black vertical lines show lowest misfit threshold permitted by Morozov&#x02019;s discrepancy principle (here the expression simplifies to <inline-formula id="IEq22"><alternatives><tex-math id="M21">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$\frac{SNPs}{2} = 12$$\end{document}</tex-math><mml:math id="M22"><mml:mrow><mml:mfrac><mml:mrow><mml:mi mathvariant="italic">SNPs</mml:mi></mml:mrow><mml:mn>2</mml:mn></mml:mfrac><mml:mo>=</mml:mo><mml:mn>12</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq22.gif"/></alternatives></inline-formula>), below which any signal is effectively better explained by noise [<xref ref-type="bibr" rid="CR17">17</xref>]</p></caption><graphic xlink:href="12936_2021_3624_Fig4_HTML" id="MO4"/></fig></p></sec><sec id="Sec29"><title>Calibrating the STIM threshold</title><p id="Par61">The assumption of Gaussian noise works well in silico, but a rigorous model for experimental noise in practice is lacking. To adapt STIM for field data, the first step was to calculate the MOI threshold <italic>T</italic> for laboratory strain mixture data (108 samples) while simultaneously calibrating MOI values that are analytically predetermined. To tolerate error in the pipeline and the noise-to-signal uncertainty in StrainRecon, the threshold <italic>T</italic> was varied while conservatively measuring whether samples from the laboratory strain experiment were called at <inline-formula id="IEq11"><alternatives><tex-math id="M23">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n = 3 \pm 1$$\end{document}</tex-math><mml:math id="M24"><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn><mml:mo>&#x000b1;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq11.gif"/></alternatives></inline-formula> strains. The case for <inline-formula id="IEq12"><alternatives><tex-math id="M25">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n = 2$$\end{document}</tex-math><mml:math id="M26"><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq12.gif"/></alternatives></inline-formula> accounts for mixture proportions that were so low (0.5&#x02013;1%) relative to noise that the corresponding samples effectively have two strains. Conversely, <inline-formula id="IEq13"><alternatives><tex-math id="M27">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n = 4$$\end{document}</tex-math><mml:math id="M28"><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq13.gif"/></alternatives></inline-formula> accommodates the Combination A samples in mixtures that had a mutation in SNP23 in one of the laboratory strains, thus creating 4-strain combinations (Additional file <xref rid="MOESM3" ref-type="media">3</xref>: Fig. S3). In STIM, the calibrated threshold of <inline-formula id="IEq14"><alternatives><tex-math id="M29">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$T = 1.8 \times 10^{ - 7}$$\end{document}</tex-math><mml:math id="M30"><mml:mrow><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mn>1.8</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq14.gif"/></alternatives></inline-formula> provided MOI within <inline-formula id="IEq15"><alternatives><tex-math id="M31">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$n = 3 \pm 1$$\end{document}</tex-math><mml:math id="M32"><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>3</mml:mn><mml:mo>&#x000b1;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq15.gif"/></alternatives></inline-formula> strains for 101 out of 108 laboratory-mixed samples (93.5%). Overestimation (n&#x02009;=&#x02009;5) was observed in 7 out of 108 laboratory samples (6.5%). The results are delineated in Table <xref rid="Tab2" ref-type="table">2</xref>. To account for any potential inaccuracy in the threshold choice determination, the final step evaluated the field results across a range of threshold values (see Fig.&#x000a0;<xref rid="Fig6" ref-type="fig">6</xref>).<table-wrap id="Tab2"><label>Table 2</label><caption><p>Distribution of MOI estimates by STIM on 108 laboratory-mixed samples using the threshold <inline-formula id="IEq25"><alternatives><tex-math id="M33">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$T = 1.8 \times 10^{ - 7}$$\end{document}</tex-math><mml:math id="M34"><mml:mrow><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mn>1.8</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq25.gif"/></alternatives></inline-formula></p></caption><table frame="hsides" rules="groups"><thead><tr><th align="left">True MOI</th><th align="left">1</th><th align="left">2</th><th align="left">3</th><th align="left">4</th><th align="left">5</th></tr></thead><tbody><tr><td align="left">n&#x02009;=&#x02009;4 (Comb. A)</td><td char="." align="char">0</td><td char="." align="char">3</td><td char="." align="char">13</td><td char="." align="char"><italic>17</italic></td><td char="." align="char">3</td></tr><tr><td align="left">n&#x02009;=&#x02009;3 (Comb. B)</td><td char="." align="char">0</td><td char="." align="char">4</td><td char="." align="char"><italic>21</italic></td><td char="." align="char">11</td><td char="." align="char">0</td></tr><tr><td align="left">n&#x02009;=&#x02009;3 (Comb. C)</td><td char="." align="char">0</td><td char="." align="char">3</td><td char="." align="char"><italic>15</italic></td><td char="." align="char">14</td><td char="." align="char">4</td></tr></tbody></table><table-wrap-foot><p>Estimates for the true MOI value are italicized</p></table-wrap-foot></table-wrap></p></sec><sec id="Sec30"><title>MOI in Kenyan field samples</title><p id="Par62">The StrainRecon and STIM algorithms with a threshold of <inline-formula id="IEq16"><alternatives><tex-math id="M35">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$T = 1.8 \times 10^{ - 7}$$\end{document}</tex-math><mml:math id="M36"><mml:mrow><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mn>1.8</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq16.gif"/></alternatives></inline-formula> were used to evaluate field samples from 4 cross-sectional surveys conducted in Kenya. A Kruskal&#x02013;Wallis <italic>H</italic>-test (non-parametric one-way analysis of variance that extends the Mann&#x02013;Whitney U test) on the MOI differing among smear-positive samples from 1996, 2001, 2007, and 2012 surveys was significant (<inline-formula id="IEq17"><alternatives><tex-math id="M37">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$H = 29.7$$\end{document}</tex-math><mml:math id="M38"><mml:mrow><mml:mi>H</mml:mi><mml:mo>=</mml:mo><mml:mn>29.7</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq17.gif"/></alternatives></inline-formula>, <inline-formula id="IEq18"><alternatives><tex-math id="M39">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$p &#x0003c; 2 \times 10^{ - 6}$$\end{document}</tex-math><mml:math id="M40"><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x0003c;</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>6</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq18.gif"/></alternatives></inline-formula>). The result suggests that the distribution of MOI in at least one of the years stochastically dominated another.</p><p id="Par63">To determine the differences in MOI between years, the Conover&#x02013;Imam test was carried out to compare pairwise stochastic dominance for all pairs. False discovery rate (FDR) was controlled for using the two-stage step-up method of Benjamini, Krieger, and Yekutieli (BKY), which improves the power of the well-known Benjamini and Hochberg (BH) FDR mitigation method without making additional assumptions [<xref ref-type="bibr" rid="CR28">28</xref>]. The results with the FDR-corrected <italic>p</italic> values (<italic>q</italic> values) are shown in Table <xref rid="Tab3" ref-type="table">3</xref>. The comparisons showed a significant decrease in MOI (<inline-formula id="IEq19"><alternatives><tex-math id="M41">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$q &#x0003c; 0.02$$\end{document}</tex-math><mml:math id="M42"><mml:mrow><mml:mi>q</mml:mi><mml:mo>&#x0003c;</mml:mo><mml:mn>0.02</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq19.gif"/></alternatives></inline-formula>) between every pair of survey years between 1996 and 2012 with the exception of 2007 and 2012 (<inline-formula id="IEq20"><alternatives><tex-math id="M43">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$q = 0.15$$\end{document}</tex-math><mml:math id="M44"><mml:mrow><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mn>0.15</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq20.gif"/></alternatives></inline-formula>). In best fitting results from STIM, all samples contained either 5 or fewer strains and no sample contained 6 (or more) strains above the&#x02009;5% noise level resolution of STIM. Notably, strains of low proportions (&#x02264;&#x02009;5%) in samples are indistinguishable from pipeline noise in the MOI estimates. The STIM results thus suggest a decline from an average of 4.32 strains per infected person in 1996 to 4.01, 3.56, and 3.35 in the years 2001, 2007, and 2012, respectively (Fig.&#x000a0;<xref rid="Fig5" ref-type="fig">5</xref>c). Figure&#x000a0;<xref rid="Fig5" ref-type="fig">5</xref>d also shows that the fraction of samples with one strain increased from 3% in 1996 to 17% in 2012 while the fraction of samples with 5 strains reduced from 57% in 1996 to 18% in 2012. The FDR-adjusted statistical test results for STIM are robust to changes in the misfit threshold parameter <italic>T</italic> (Fig.&#x000a0;<xref rid="Fig6" ref-type="fig">6</xref>) except between 2007 and 2012 which straddles the <inline-formula id="IEq21"><alternatives><tex-math id="M45">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$q = 0.05$$\end{document}</tex-math><mml:math id="M46"><mml:mrow><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mn>0.05</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq21.gif"/></alternatives></inline-formula> boundary.<table-wrap id="Tab3"><label>Table 3</label><caption><p>Difference in MOI across years in the Kenyan field samples</p></caption><table frame="hsides" rules="groups"><thead><tr><th align="left"/><th align="left">1996</th><th align="left">2001</th><th align="left">2007</th><th align="left">2012</th></tr></thead><tbody><tr><td align="left">1996</td><td align="left"/><td align="left"><inline-formula id="IEq26"><alternatives><tex-math id="M47">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$1.7 \times 10^{ - 2}$$\end{document}</tex-math><mml:math id="M48"><mml:mrow><mml:mn>1.7</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq26.gif"/></alternatives></inline-formula>*</td><td align="left"><inline-formula id="IEq27"><alternatives><tex-math id="M49">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$9.8 \times 10^{ - 5}$$\end{document}</tex-math><mml:math id="M50"><mml:mrow><mml:mn>9.8</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq27.gif"/></alternatives></inline-formula>*</td><td align="left"><inline-formula id="IEq28"><alternatives><tex-math id="M51">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$4.4 \times 10^{ - 7}$$\end{document}</tex-math><mml:math id="M52"><mml:mrow><mml:mn>4.4</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq28.gif"/></alternatives></inline-formula>*</td></tr><tr><td align="left">2001</td><td align="left"><inline-formula id="IEq29"><alternatives><tex-math id="M53">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$1.7 \times 10^{ - 2}$$\end{document}</tex-math><mml:math id="M54"><mml:mrow><mml:mn>1.7</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq29.gif"/></alternatives></inline-formula>*</td><td align="left"/><td align="left"><inline-formula id="IEq30"><alternatives><tex-math id="M55">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$1.6 \times 10^{ - 2}$$\end{document}</tex-math><mml:math id="M56"><mml:mrow><mml:mn>1.6</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq30.gif"/></alternatives></inline-formula>*</td><td align="left"><inline-formula id="IEq31"><alternatives><tex-math id="M57">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$7.0 \times 10^{ - 4}$$\end{document}</tex-math><mml:math id="M58"><mml:mrow><mml:mn>7.0</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq31.gif"/></alternatives></inline-formula>*</td></tr><tr><td align="left">2007</td><td align="left"><inline-formula id="IEq32"><alternatives><tex-math id="M59">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$9.8 \times 10^{ - 5}$$\end{document}</tex-math><mml:math id="M60"><mml:mrow><mml:mn>9.8</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>5</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq32.gif"/></alternatives></inline-formula>*</td><td align="left"><inline-formula id="IEq33"><alternatives><tex-math id="M61">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$1.6 \times 10^{ - 2}$$\end{document}</tex-math><mml:math id="M62"><mml:mrow><mml:mn>1.6</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq33.gif"/></alternatives></inline-formula>*</td><td align="left"/><td align="left"><inline-formula id="IEq34"><alternatives><tex-math id="M63">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$1.5 \times 10^{ - 1}$$\end{document}</tex-math><mml:math id="M64"><mml:mrow><mml:mn>1.5</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq34.gif"/></alternatives></inline-formula></td></tr><tr><td align="left">2012</td><td align="left"><inline-formula id="IEq35"><alternatives><tex-math id="M65">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$4.4 \times 10^{ - 7 }$$\end{document}</tex-math><mml:math id="M66"><mml:mrow><mml:mn>4.4</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq35.gif"/></alternatives></inline-formula>*</td><td align="left"><inline-formula id="IEq36"><alternatives><tex-math id="M67">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$7.0 \times 10^{ - 4}$$\end{document}</tex-math><mml:math id="M68"><mml:mrow><mml:mn>7.0</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq36.gif"/></alternatives></inline-formula>*</td><td align="left"><inline-formula id="IEq37"><alternatives><tex-math id="M69">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$1.5 \times 10^{ - 1}$$\end{document}</tex-math><mml:math id="M70"><mml:mrow><mml:mn>1.5</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq37.gif"/></alternatives></inline-formula></td><td align="left"/></tr></tbody></table><table-wrap-foot><p>FDR-corrected <italic>p</italic>-values (<italic>q</italic>-values) of Conover&#x02013;Imam significance tests for differences in MOI across all pairs of years in the Kenyan field samples. Significance at the <inline-formula id="IEq38"><alternatives><tex-math id="M71">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$q = 0.05$$\end{document}</tex-math><mml:math id="M72"><mml:mrow><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mn>0.05</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq38.gif"/></alternatives></inline-formula> level is denoted by an asterisk (*)</p></table-wrap-foot></table-wrap><fig id="Fig5"><label>Fig. 5</label><caption><p>Metrics for assessing the change in transmission level in western Kenya. <bold>a</bold> EIR in Asembo, western Kenya from 1995 to 2012. <bold>b</bold> Malaria prevalence in children under 5&#x000a0;years old with a 95% confidence interval in the same area from 1996 to 2012. <bold>c</bold> MOI as estimated by the STIM algorithm from 1996, 2001, 2007, and 2012 surveys in the same area, presented as average MOI. <bold>d</bold> Percentage of samples with different numbers of strains in the same area from 1996, 2001, 2007, and 2012 surveys. EIR data and malaria prevalence data, based on smear diagnosis, were extracted from published [<xref ref-type="bibr" rid="CR38">38</xref>] and unpublished (KEMRI/CDC) data. The StrainRecon and STIM algorithms with a threshold of <inline-formula id="IEq23"><alternatives><tex-math id="M73">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$T = 1.8 \times 10^{ - 7}$$\end{document}</tex-math><mml:math id="M74"><mml:mrow><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mn>1.8</mml:mn><mml:mo>&#x000d7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>-</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq23.gif"/></alternatives></inline-formula> were run for the SNP frequency data generated from smear-positive samples from 4 surveys in Kenya for MOI estimation</p></caption><graphic xlink:href="12936_2021_3624_Fig5_HTML" id="MO6"/></fig><fig id="Fig6"><label>Fig. 6</label><caption><p>FDR-adjusted significance levels of the key statistical tests with different STIM thresholds <italic>T</italic>. Robustness across a neighborhood of values around the chosen <italic>T</italic>, shown on a log&#x02013;log scale. The dotted black vertical line is the threshold derived from laboratory data and was used to generate Fig.&#x000a0;<xref rid="Fig5" ref-type="fig">5</xref>c, d and Table <xref rid="Tab3" ref-type="table">3</xref>; the dotted blue horizontal line is the significance level of <inline-formula id="IEq24"><alternatives><tex-math id="M75">\documentclass[12pt]{minimal}
				\usepackage{amsmath}
				\usepackage{wasysym} 
				\usepackage{amsfonts} 
				\usepackage{amssymb} 
				\usepackage{amsbsy}
				\usepackage{mathrsfs}
				\usepackage{upgreek}
				\setlength{\oddsidemargin}{-69pt}
				\begin{document}$$q = 0.05$$\end{document}</tex-math><mml:math id="M76"><mml:mrow><mml:mi>q</mml:mi><mml:mo>=</mml:mo><mml:mn>0.05</mml:mn></mml:mrow></mml:math><inline-graphic xlink:href="12936_2021_3624_Article_IEq24.gif"/></alternatives></inline-formula>. The strain count was not significantly different between 2007 and 2012 in the vicinity of the MOI misfit threshold used</p></caption><graphic xlink:href="12936_2021_3624_Fig6_HTML" id="MO5"/></fig></p><p id="Par64">In addition to MOI, all barcodes with at least 5% proportion that StrainRecon reconstructed from the samples across all years were unique, 878 distinct strains in total. This finding suggests a lack of clonality in the Asembo Bay area.</p><p id="Par65">Since the surveys 2007 and 2012 comprised individuals up to 20&#x000a0;years of age, a potential influence of age on MOI was further examined. There were no significant correlations between MOI and age, comparing children of 5&#x000a0;years and below with older ages, in 2007 (q&#x02009;=&#x02009;0.07) or 2012 (q&#x02009;=&#x02009;0.24) surveys, respectively.</p><p id="Par66">In summary, the validation using field samples from Kenya suggests that the approach developed in this study could be used on the field samples for reliably reconstructing strains in individual samples and for detecting changes in MOI over time.</p></sec></sec><sec id="Sec31"><title>Discussion</title><p id="Par67">This study described the development of laboratory assays with multiplex PCRs followed by NGS, a unique bioinformatic process with B4Screening pathway and a novel threshold-calibrated MOI estimation method capable of detecting multiple-strain infections of <italic>P. falciparum</italic> parasites in artificially mixed laboratory strains and field isolates. Using this pipeline, the 24 barcode SNPs were identified successfully and uniformly from the 12 chromosomes of <italic>P. falciparum</italic> in a sample. 
The impact of pre-amplification, parasite concentration and strain proportion on the SNP frequency of mixed laboratory strains was evaluated. Parasite concentration within the tested range and pre-amplification did not influence the SNP frequency of strains within the same proportion, which allowed the evaluation of the field samples with a range of parasite densities with and without pre-amplification. Both DBS and frozen blood yielded sufficient reads in each SNP location analysed. Based on consistent barcode 24 SNP frequency calls at targeted locations, the algorithmic reconstruction of strains for each sample using the novel published StrainRecon algorithm [<xref ref-type="bibr" rid="CR17">17</xref>] reconstructed the barcodes of dominant strains with 98.5% accuracy. In field samples from western Kenya, up to 5 strains in a sample were identified using the same tools described above, and temporal changes in 24 SNP barcode-based MOI could be reliably estimated using the novel threshold-calibrated STIM method developed in this study.</p><sec id="Sec32"><title>Pipeline development</title><p id="Par68">Multiplex PCR is a fast and cost-saving approach for pathogen diagnosis and genotyping [<xref ref-type="bibr" rid="CR29">29</xref>, <xref ref-type="bibr" rid="CR30">30</xref>]. Three multiplex PCRs covering all 24 barcode SNPs identified by Daniels et al. [<xref ref-type="bibr" rid="CR19">19</xref>] within the <italic>P. falciparum</italic> genome were developed in this study. The 24 short target regions needed to be amplified with comparable efficiency in multiplex PCRs to ensure generation of sufficient SNP coverage at each location via NGS. The 3 optimal multiplex PCRs were developed through a series of optimizations and pilot testing.</p><p id="Par69">The most important and challenging issue was how to uniformly call the 24 barcode SNP frequencies based on NGS data and assign these frequencies to haplotypes. 
The second, interlinked, challenge was to determine how many strains were represented in each sample. To resolve these challenges, the StrainRecon mathematical algorithm for strain disambiguation [<xref ref-type="bibr" rid="CR17">17</xref>] from different chromosomes and different SNPs [<xref ref-type="bibr" rid="CR19">19</xref>] was leveraged in this study. It is important to point out that the consistency of SNP frequency calls at targeted SNP locations determines the ability of StrainRecon, and thus STIM, to successfully assign these frequencies to haplotypes, disentangle the multiple strains within a sample, and estimate MOI.</p><p id="Par70">To maximize the consistency of SNP frequency calls from NGS data for haplotype assignment and strain reconstruction, the unique bioinformatics pipeline with B4Screening pathway developed in this study first removed spurious amplicons introduced by sequencing. Initial data trimming and cleaning steps by CutAdapt and Prinseq, and data quality visualization via FaQCs prior to entry into the Bioconductor pipeline, ensured that only trimmed reads of sufficient quality were processed. Without proper screening of noise, the extraneous element could erroneously generate orders of magnitude more sequences than truly exist in the data. Numerous low copy number amplicons that did not reflect true diverse amplicons were removed at population level using the novel and sensitive RF classifier developed in this study.</p><p id="Par71">While this novel bioinformatics pipeline currently applies to the described 24-SNP malaria barcoding scheme, the implications of data cleaning steps here indicate the importance of careful evaluation of NGS output in any non-template-driven system, such as whole genome multi locus sequence typing (wgMLST) or other barcoding approaches. 
The majority of unique amplicons (yet minority of total reads) generated in the laboratory and subsequently removed by routine bioinformatics processing did not reflect intended targets. In addition, chimerism between multiplex PCR targets was substantial in raw data. Therefore, leaving primers that are clearly distinct between sites on reads during the processing allowed a quick screen for chimeras. Incorporating this step prior to primer trimming also improves data quality and efficiency of analysis.</p><p id="Par72">In the bioinformatics pipeline, where only information regarding known specific SNP sites (binary) was incorporated, the finding of multiple distinct point mutations per amplicon was not utilized. This latter information (category) can be incorporated in future work not only to enhance 24 SNP frequency calls but also to be useful for amplicon deep sequencing data analysis.</p><p id="Par73">Although laboratory or clinical samples can have high parasite densities, field samples from population-based community surveys are collected largely from asymptomatic individuals who tend to have low parasite densities. In addition, in high and medium transmission areas, a minor proportion of parasite strains in a sample is often undetectable using conventional molecular technologies [<xref ref-type="bibr" rid="CR31">31</xref>]. Both low parasite density samples and low proportion of parasite strains in a sample could increase difficulty in producing consistent frequencies across all SNP locations due to the limited DNA template availability. In this study, the impact of pre-amplification, parasite concentration and strain proportion on consistency of 24 SNP frequency calls were evaluated. It showed that SNP frequencies were not influenced by parasite concentration within the currently tested range or by pre-amplification, but differed by strain proportion. 
It is easy to envision that SNP frequency calls are entirely based on original DNA template diversity when noise and spurious amplicons from NGS are minimized using the unique bioinformatics developed in this study. Most importantly, the results have practical applications. First, it allows evaluation of field samples with a range of parasite densities, with and without pre-amplification. Second, pre-amplification allows evaluation of samples that have insufficient parasite concentration for analysis of diversity, but sufficient concentration to amplify without this step. Critically, NGS is sensitive for detection of low frequency SNPs in a sample [<xref ref-type="bibr" rid="CR32">32</xref>, <xref ref-type="bibr" rid="CR33">33</xref>]. Compared to existing conventional molecular tools where minor parasite proportion below 10&#x02013;30% in a sample were generally undetectable [<xref ref-type="bibr" rid="CR9">9</xref>, <xref ref-type="bibr" rid="CR34">34</xref>&#x02013;<xref ref-type="bibr" rid="CR36">36</xref>], the tools developed in this study detected the barcodes of dominant strain with 98.5% accuracy and the proportion of parasite strains ranging from 2 to 10% in a sample with accuracy between 77.5 and 89.3%. Although the target lowest limit for minor strain detection was designed at 0.5&#x000a0;parasite/&#x000b5;l (see second subsection for laboratory assay development in <xref rid="Sec2" ref-type="sec">Methods</xref>), this study did not attain such a fine level of resolution due to the inaccuracy observed at lower proportions (0.5&#x02013;2%) of strains, specifically that as the strain proportion decreases, the ability to accurately detect the minor strain diminishes. This lower bound is influenced by both background noise level in the current analysis pipeline and the low DNA copy number of minor strains in the sample, decreasing expected precision. 
Nevertheless, the results from this study showed a substantial improvement in sensitivity for detecting minor parasite populations in a sample, particularly those above 5% proportion, with acceptable precision. While deep sequencing of individual targeted antigenic genes also provides ability to detect gene-specific minor variants in a sample, the estimates in the highly selected genes might not represent true genomic signatures of parasites [<xref ref-type="bibr" rid="CR14">14</xref>, <xref ref-type="bibr" rid="CR15">15</xref>, <xref ref-type="bibr" rid="CR37">37</xref>] and may offer limited temporal and geographic discrimination between parasite populations [<xref ref-type="bibr" rid="CR15">15</xref>].</p><p id="Par74">The STIM algorithm for assessing MOI based on SNP read frequencies relies crucially on noise levels. Highly variable input in a noisy pipeline may cause MOI to be overestimated by confusing noise with true signal in reconstruction. A threshold was placed on MOI misfit, and calibrated by balancing the false negative rate of the strain reconstruction quality of StrainRecon on the known laboratory samples to the false positive rate and number of strains estimated. The same threshold was then used for running STIM on field data under the assumption that most noise would be from the pipeline steps against which the threshold already accounted. Importantly, the trends and conclusions from the field continue to hold even if the true threshold for this was slightly shifted relative to the one determined by the laboratory strain setting. In other words, MOI value estimated by STIM is subject to noise, whereas temporal changes in MOI as estimated by STIM are resilient to such noise. 
A large study across geographic regions is ongoing to examine the robustness of the STIM method in the field as well as potential needs for further calibration of the proposed threshold with a richer set of artificial strain mixtures.</p></sec><sec id="Sec33"><title>Field results</title><p id="Par75">MOI has only recently been used as a metric for malaria transmission. Therefore, the EIR and malaria prevalence in children 5&#x000a0;years old and younger from the same study area were obtained from 1995&#x02013;1996 to 2012 (Fig.&#x000a0;<xref rid="Fig5" ref-type="fig">5</xref>a, b, data extracted from both published [<xref ref-type="bibr" rid="CR38">38</xref>] and unpublished KEMRI/CDC data) for side-by-side comparison with the MOI estimated in the current study (Fig.&#x000a0;<xref rid="Fig5" ref-type="fig">5</xref>c, d). The results show that the EIR sharply declined between 1995 and 2001 and remained low, whereas malaria prevalence gradually decreased between 1996 and 2007 and then reached a plateau between 2007 and 2012. In comparison, the average MOI gradually declined over time and the percentage of samples with 5 strains dropped from 57% in 1996 to 18% in 2012 (during which period the proportion of one-strain samples increased). Since there was no correlation between MOI and age, the decline in MOI over time is unlikely to be confounded by host age [<xref ref-type="bibr" rid="CR20">20</xref>]. Overall, the decreases in both average MOI and proportion of samples with 5 strains over time are in tandem with the decline in EIR and malaria prevalence, but the turning points are different. Specifically, MOI shows slow reduction, EIR has a sharp decline, and malaria prevalence stagnates from 2007 to 2012. This suggests a non-linear scaling relationship among the three malaria metrics [<xref ref-type="bibr" rid="CR3">3</xref>]. 
The reasons behind the slow reduction in MOI are unclear; the large number of distinct strains detected in the area may play a role (878 distinct strains at a proportion of at least 5% were detected in total and are reported in <xref rid="Sec20" ref-type="sec">Results</xref>). Nevertheless, the MOI, which provides the information of strain numbers within a host, is a higher resolution parasite index compared with the malaria prevalence index and it might represent true transmission level. A further study of the parasite strain population size and strain relatedness is needed using this dataset.</p><p id="Par76">The tools developed in this study advance the estimation of not only the number of strains within a host but also the number of strains at a population level, enhancing the resolution for MOI estimation. This advantage is particularly obvious compared to the original TaqMan PCR 24 SNP barcode assay and COIL analysis for complexity of infection (COI) estimation in which monomorphic or polymorphic genotypes within each sample are estimated [<xref ref-type="bibr" rid="CR19">19</xref>&#x02013;<xref ref-type="bibr" rid="CR21">21</xref>]. Taken together, the combined approach established in this study could be used for MOI estimation, particularly for temporal changes in MOI in regions with medium to high transmission levels. A large-scale validation study is being conducted using samples from different malaria-endemic countries/regions with heterogeneous transmission intensity.</p></sec></sec><sec id="Sec34"><title>Conclusion</title><p id="Par77">This study demonstrated that the combined approach of new multiplex PCRs, NGS and the unique bioinformatics pipeline developed in this study, together with the previously published StrainRecon algorithm, could identify prominent 24 barcode SNPs correctly and consistently across 12 of the 14 chromosomes of <italic>P. falciparum</italic>. 
Coupled with the novel threshold-calibrated MOI estimation method STIM, the proposed approach in this study provides a sensitive and high-resolution MOI estimator that could be used on field samples to measure temporal changes in MOI and to provide additional parasite indices for interpreting transmissions. The combined approach can potentially be used to study MOI in malaria clinical manifestations and to evaluate impact of interventions on transmission reduction for programme purpose. The utility of StrainRecon with the 24 SNP advanced laboratory tools and unique bioinformatics pipeline deserves further exploration for distinguishing recrudescence from re-infection in drug trials. Such tools can be further geared to tracking of imported malaria cases. The pseudocode for B4Screening pathway and STIM program code are available for download online at: <ext-link ext-link-type="uri" xlink:href="https://www.ymsir.com/stim/">https://www.ymsir.com/stim/</ext-link>.</p></sec><sec sec-type="supplementary-material"><title>Supplementary Information</title><sec id="Sec35"><p>
<supplementary-material content-type="local-data" id="MOESM1"><media xlink:href="12936_2021_3624_MOESM1_ESM.pdf"><caption><p><bold>Additional file 1: Figure S1.</bold> RF classification using laboratory strains. Figure shows proportion of each unique amplicon relative to all amplicons from the targeted region on a plate from all laboratory strain samples based on RF classifier (when random selection determined the training set). X axis is the log<sub>10</sub> count of SNP-specific reads in a Miseq run and y axis is log<sub>10</sub> count of unique amplicon-specific reads in that individual run. Red points are amplicons classified as negative (False) for exclusion while green points are amplicons classified as positive (True) for further SNP frequency analysis.</p></caption></media></supplementary-material><supplementary-material content-type="local-data" id="MOESM2"><media xlink:href="12936_2021_3624_MOESM2_ESM.tif"><caption><p><bold>Additional file 2: Figure S2.</bold> Depth of coverage across loci in laboratory strains. Log<sub>10</sub> reads per SNP target location in laboratory strain data, from 1 to 24 IQR and outliers using a minimum threshold of 500 reads for a sample to be included. Green dots and right Y axis represent proportion of samples missing a value at each SNP site.</p></caption></media></supplementary-material><supplementary-material content-type="local-data" id="MOESM3"><media xlink:href="12936_2021_3624_MOESM3_ESM.pdf"><caption><p><bold>Additional file 3: Figure S3.</bold> Plot SNP23 from Combination A mixes. Plot SNP23 from Combination A mixes was made separately from all other SNPs. Data is also separated by source material (DBS and frozen blood samples). Within each mixture, left to right, the colors are: blue&#x02014;all strains identical, orange&#x02014;dominant strain unique, green&#x02014;intermediate strain unique, yellow&#x02014;low strain unique, and gray&#x02014;SNP23 in Combination A mixes. 
A pilot experiment was also conducted using a different culture source, and this difference at SNP23 was not observed. Based on the consistency among all other SNP sites and the difference between parasite batches, SNP23 from Combination A was excluded from the Fig.&#x000a0;<xref rid="Fig3" ref-type="fig">3</xref> analysis concerning two distinct SNP reads.</p></caption></media></supplementary-material><supplementary-material content-type="local-data" id="MOESM4"><media xlink:href="12936_2021_3624_MOESM4_ESM.docx"><caption><p><bold>Additional file 4: Figure S4.</bold> StrainRecon reconstruction on laboratory-mixed samples. Ground truth of barcodes (first row in each figure), the MAP estimate of the reconstruction matrix <bold>M</bold> and mixture vector <bold><italic>v</italic></bold> (second row), and the mean (third row) and standard deviation (fourth row) of the posterior density of candidate (<bold>M</bold>, <bold><italic>v</italic></bold>) solutions. Each block contains one row for each strain, ordered by decreasing frequency from top, with a SNP cell color ranging from purple (fraction of 0) to bright yellow (fraction of 1). Each reconstruction is shown as the input parameter <italic>n</italic> (the number of strains to be reconstructed in StrainRecon) is varied from <italic>n</italic>&#x02009;=&#x02009;1 to 4 (row-major order) on the B_24NOPA_DBS sample. The dominant strain is captured perfectly and with high confidence by considering the posterior statistics. 
The algorithm has difficulty reconstructing the other two less-prevalent strains, since their target range of&#x02009;&#x0003c;&#x02009;4% is low relative to the experimental noise levels seen in the pipeline.</p></caption></media></supplementary-material><supplementary-material content-type="local-data" id="MOESM5"><media xlink:href="12936_2021_3624_MOESM5_ESM.docx"><caption><p><bold>Additional file 5: Figure S5.</bold> StrainRecon reconstruction on laboratory-mixed samples given advance knowledge of <italic>n</italic>&#x02009;=&#x02009;3 strains. Ground truth of barcodes (first row in each figure), the MAP estimate of the reconstruction matrix <bold>M</bold> and mixture vector <bold><italic>v</italic></bold> (second row), and the mean (third row) and standard deviation (fourth row) of the posterior density of candidate (<bold>M</bold>, <bold><italic>v</italic></bold>) solutions. Each block contains one row for each strain, ordered by decreasing frequency from top, with a SNP cell color ranging from purple (fraction of 0) to bright yellow (fraction of 1). 
The figure showcases the algorithm outputs with <italic>n</italic>&#x02009;=&#x02009;3 strains on a variety of samples and mixtures, including cases of accurate and unique strain reconstruction (such as B33_PA_DBS).</p></caption></media></supplementary-material></p></sec></sec></body><back><glossary><title>Abbreviations</title><def-list><def-item><term>MOI</term><def><p id="Par5">Multiplicity of infection</p></def></def-item><def-item><term>SNP</term><def><p id="Par6">Single nucleotide polymorphism</p></def></def-item><def-item><term>NGS</term><def><p id="Par7">Next generation sequencing</p></def></def-item><def-item><term>DBS</term><def><p id="Par8">Dried blood spot</p></def></def-item><def-item><term>DNA</term><def><p id="Par9">Deoxyribonucleic acid</p></def></def-item><def-item><term>PCR</term><def><p id="Par10">Polymerase chain reaction</p></def></def-item><def-item><term>REF</term><def><p id="Par11">Reference</p></def></def-item><def-item><term>ALT</term><def><p id="Par12">Alternative</p></def></def-item><def-item><term>PA</term><def><p id="Par13">Pre-amplification</p></def></def-item><def-item><term>NOPA</term><def><p id="Par14">Non-pre-amplification</p></def></def-item><def-item><term>B4Screening</term><def><p id="Par15">A novel pathway involving four new steps in bioinformatics pipeline</p></def></def-item><def-item><term>StrainRecon</term><def><p id="Par16">A novel mathematical algorithm for haplotype analysis and Strain reconstruction</p></def></def-item><def-item><term>MAP</term><def><p id="Par17">Maximum-a-posteriori</p></def></def-item><def-item><term>STIM</term><def><p id="Par18">A novel mathematical algorithm for MOI estimation</p></def></def-item><def-item><term>IQR</term><def><p id="Par19">Interquartile range</p></def></def-item></def-list></glossary><fn-group><fn><p><bold>Publisher's Note</bold></p><p>Springer Nature remains neutral with regard to jurisdictional claims in published maps and institutional affiliations.</p></fn><fn><p>Rebecca M. 
Mitchell and Zhiyong Zhou contributed equally to this work.</p></fn></fn-group><sec><title>Supplementary Information</title><p>The online version contains supplementary material available at 10.1186/s12936-021-03624-2.</p></sec><ack><title>Acknowledgements</title><p>RMM was supported by an American Society for Microbiology/Centers for Disease Control and Prevention Fellowship. The work was partially supported by the Incubator Project fund from the Office of Advanced Molecular Detection (OAMD) of CDC and the Malaria Branch, DPDM of CDC. YV was partially supported by NSF CAREER Grant #1553579. The authors thank all families and people who participated in cross-sectional surveys in Kenya. The authors also thank Monica Shah, DPDM, CGH, CDC for assisting in data management for earlier cross-sectional surveys conducted in western Kenya, and appreciate Drs. Mateusz Plucinski and Patrick Kachur, DPDM, CGH, CDC for reviewing the manuscript and providing valuable suggestions. This paper is published with the permission of the director of the Kenya Medical Research Institute. The findings and conclusions in this paper are those of the authors and do not necessarily represent the official position of the Centers for Disease Control and Prevention.</p></ack><notes notes-type="author-contribution"><title>Authors&#x02019; contributions</title><p>YPS conceived, designed and coordinated the study and provided editorial input for the manuscript. RMM designed the experiment. RMM, ZZ and SS performed multiplex PCRs. MS and MF performed Illumina library preparation and MiSeq sequencing. RMM conducted bioinformatics work. VN and BH assisted with sequence cleaning and data management. RMM and YV performed mathematical model analysis, statistics, interface with StrainRecon, and work on STIM. ZZ submitted all Sequence Read Archive (SRA) data for this study to the NCBI BioProject. 
FK, JG, KL, LS, HM, MD, KO and SK led or participated in cross-sectional surveys including field sample collection, logistics, and epidemiological and entomological data collection and management in western Kenya. RMM, ZZ and YV wrote the manuscript. All authors read and approved the final manuscript.</p></notes><notes notes-type="data-availability"><title>Availability of data and materials</title><p>The source code for the B4Screening pathway for bioinformatics pipeline and STIM analysis tool is available online at: <ext-link ext-link-type="uri" xlink:href="https://www.ymsir.com/stim/">https://www.ymsir.com/stim/</ext-link>. All Sequence Read Archive (SRA) data for this study were submitted to NCBI BioProject under accession no. PRJNA555848, <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/sra/?term=PRJNA555848">https://www.ncbi.nlm.nih.gov/sra/?term=PRJNA555848</ext-link>.</p></notes><notes id="FPar1" notes-type="COI-statement"><title>Competing interests</title><p id="Par78">The authors declare that they have no competing interests.</p></notes><ref-list id="Bib1"><title>References</title><ref id="CR1"><label>1.</label><mixed-citation publication-type="other">WHO. World malaria report 2019. Geneva: World Health Organization. 2019. 
<ext-link ext-link-type="uri" xlink:href="https://www.who.int/malaria/publications/world-malaria-report-2019/en/">https://www.who.int/malaria/publications/world-malaria-report-2019/en/</ext-link>.</mixed-citation></ref><ref id="CR2"><label>2.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kilama</surname><given-names>M</given-names></name><name><surname>Smith</surname><given-names>DL</given-names></name><name><surname>Hutchinson</surname><given-names>R</given-names></name><name><surname>Kigozi</surname><given-names>R</given-names></name><name><surname>Yeka</surname><given-names>A</given-names></name><name><surname>Lavoy</surname><given-names>G</given-names></name><etal/></person-group><article-title>Estimating the annual entomological inoculation rate for <italic>Plasmodium falciparum</italic> transmitted by <italic>Anopheles gambiae</italic> s.l. using three sampling methods in three sites in Uganda</article-title><source>Malar J</source><year>2014</year><volume>13</volume><fpage>111</fpage><pub-id pub-id-type="pmid">24656206</pub-id></element-citation></ref><ref id="CR3"><label>3.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tusting</surname><given-names>LS</given-names></name><name><surname>Bousema</surname><given-names>T</given-names></name><name><surname>Smith</surname><given-names>DL</given-names></name><name><surname>Drakeley</surname><given-names>C</given-names></name></person-group><article-title>Measuring changes in <italic>Plasmodium falciparum</italic> transmission: precision, accuracy and costs of metrics</article-title><source>Adv Parasitol</source><year>2014</year><volume>84</volume><fpage>151</fpage><lpage>208</lpage><pub-id pub-id-type="pmid">24480314</pub-id></element-citation></ref><ref id="CR4"><label>4.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Mboera</surname><given-names>LE</given-names></name></person-group><article-title>Sampling techniques for adult Afrotropical malaria vectors and their reliability in the estimation of entomological inoculation rate</article-title><source>Tanzan Health Res Bull</source><year>2005</year><volume>7</volume><fpage>117</fpage><lpage>124</lpage><pub-id pub-id-type="pmid">16941936</pub-id></element-citation></ref><ref id="CR5"><label>5.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hay</surname><given-names>SI</given-names></name><name><surname>Rogers</surname><given-names>DJ</given-names></name><name><surname>Toomer</surname><given-names>JF</given-names></name><name><surname>Snow</surname><given-names>RW</given-names></name></person-group><article-title>Annual <italic>Plasmodium falciparum</italic> entomological inoculation rates (EIR) across Africa: literature survey, Internet access and review</article-title><source>Trans R Soc Trop Med Hyg</source><year>2000</year><volume>94</volume><fpage>113</fpage><lpage>127</lpage><pub-id pub-id-type="pmid">10897348</pub-id></element-citation></ref><ref id="CR6"><label>6.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Arnot</surname><given-names>D</given-names></name></person-group><article-title>Unstable malaria in Sudan: the influence of the dry season: clone multiplicity of <italic>Plasmodium falciparum</italic> infections in individuals exposed to variable levels of disease transmission</article-title><source>Trans R Soc Trop Med Hyg</source><year>1998</year><volume>92</volume><fpage>580</fpage><lpage>585</lpage><pub-id pub-id-type="pmid">10326095</pub-id></element-citation></ref><ref id="CR7"><label>7.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Beck</surname><given-names>HP</given-names></name><name><surname>Felger</surname><given-names>I</given-names></name><name><surname>Vounatsou</surname><given-names>P</given-names></name><name><surname>Hirt</surname><given-names>R</given-names></name><name><surname>Tanner</surname><given-names>M</given-names></name><name><surname>Alonso</surname><given-names>P</given-names></name><etal/></person-group><article-title>Effect of iron supplementation and malaria prophylaxis in infants on <italic>Plasmodium falciparum</italic> genotypes and multiplicity of infection</article-title><source>Trans R Soc Trop Med Hyg</source><year>1999</year><volume>93</volume><fpage>41</fpage><lpage>45</lpage><pub-id pub-id-type="pmid">10450425</pub-id></element-citation></ref><ref id="CR8"><label>8.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mbugi</surname><given-names>EV</given-names></name><name><surname>Mutayoba</surname><given-names>BM</given-names></name><name><surname>Balthazary</surname><given-names>ST</given-names></name><name><surname>Malisa</surname><given-names>AL</given-names></name><name><surname>Nyambo</surname><given-names>TB</given-names></name><name><surname>Mshinda</surname><given-names>H</given-names></name></person-group><article-title>Multiplicity of infections and level of recrudescence in <italic>Plasmodium falciparum</italic> malaria in Mlimba, Tanzania</article-title><source>Afr J Biotechnol</source><year>2006</year><volume>5</volume><fpage>1655</fpage><lpage>1662</lpage></element-citation></ref><ref id="CR9"><label>9.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Zhong</surname><given-names>D</given-names></name><name><surname>Koepfli</surname><given-names>C</given-names></name><name><surname>Cui</surname><given-names>L</given-names></name><name><surname>Yan</surname><given-names>G</given-names></name></person-group><article-title>Molecular approaches to determine the multiplicity of <italic>Plasmodium</italic> infections</article-title><source>Malar J</source><year>2018</year><volume>17</volume><fpage>172</fpage><pub-id pub-id-type="pmid">29685152</pub-id></element-citation></ref><ref id="CR10"><label>10.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nkhoma</surname><given-names>SC</given-names></name><name><surname>Banda</surname><given-names>RL</given-names></name><name><surname>Khoswe</surname><given-names>S</given-names></name><name><surname>Dzoole-Mwale</surname><given-names>TJ</given-names></name><name><surname>Ward</surname><given-names>SA</given-names></name></person-group><article-title>Intra-host dynamics of co-infecting parasite genotypes in asymptomatic malaria patients</article-title><source>Infect Genet Evol</source><year>2018</year><volume>65</volume><fpage>414</fpage><lpage>424</lpage><pub-id pub-id-type="pmid">30145390</pub-id></element-citation></ref><ref id="CR11"><label>11.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wong</surname><given-names>W</given-names></name><name><surname>Griggs</surname><given-names>AD</given-names></name><name><surname>Daniels</surname><given-names>RF</given-names></name><name><surname>Schaffner</surname><given-names>SF</given-names></name><name><surname>Ndiaye</surname><given-names>D</given-names></name><name><surname>Bei</surname><given-names>AK</given-names></name><etal/></person-group><article-title>Genetic relatedness analysis reveals the cotransmission of genetically related <italic>Plasmodium 
falciparum</italic> parasites in Thi&#x000e8;s, Senegal</article-title><source>Genome Med</source><year>2017</year><volume>9</volume><fpage>5</fpage><pub-id pub-id-type="pmid">28118860</pub-id></element-citation></ref><ref id="CR12"><label>12.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ogouyemi-Hounto</surname><given-names>A</given-names></name><name><surname>Gazard</surname><given-names>DK</given-names></name><name><surname>Ndam</surname><given-names>N</given-names></name><name><surname>Topanou</surname><given-names>E</given-names></name><name><surname>Garba</surname><given-names>O</given-names></name><name><surname>Elegbe</surname><given-names>P</given-names></name></person-group><article-title>Genetic polymorphism of merozoite surface protein-1 and merozoite surface protein-2 in <italic>Plasmodium falciparum</italic> isolates from children in South of Benin</article-title><source>Parasite</source><year>2013</year><volume>20</volume><fpage>37</fpage><pub-id pub-id-type="pmid">24135216</pub-id></element-citation></ref><ref id="CR13"><label>13.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Anderson</surname><given-names>TJ</given-names></name><name><surname>Su</surname><given-names>XZ</given-names></name><name><surname>Bockarie</surname><given-names>M</given-names></name><name><surname>Lagog</surname><given-names>M</given-names></name><name><surname>Day</surname><given-names>KP</given-names></name></person-group><article-title>Twelve microsatellite markers for characterization of <italic>Plasmodium falciparum</italic> from finger-prick blood samples</article-title><source>Parasitology</source><year>1999</year><volume>119</volume><fpage>113</fpage><lpage>125</lpage><pub-id pub-id-type="pmid">10466118</pub-id></element-citation></ref><ref id="CR14"><label>14.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Lerch</surname><given-names>A</given-names></name><name><surname>Koepfli</surname><given-names>C</given-names></name><name><surname>Hofmann</surname><given-names>NE</given-names></name><name><surname>Messerli</surname><given-names>C</given-names></name><name><surname>Wilcox</surname><given-names>S</given-names></name><name><surname>Kattenberg</surname><given-names>JH</given-names></name></person-group><article-title>Development of amplicon deep sequencing markers and data analysis pipeline for genotyping multi-clonal malaria infections</article-title><source>BMC Genom</source><year>2017</year><volume>18</volume><fpage>864</fpage></element-citation></ref><ref id="CR15"><label>15.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Miller</surname><given-names>RH</given-names></name><name><surname>Hathaway</surname><given-names>NJ</given-names></name><name><surname>Kharabora</surname><given-names>O</given-names></name><name><surname>Mwandagalirwa</surname><given-names>K</given-names></name><name><surname>Tshefu</surname><given-names>A</given-names></name><name><surname>Meshnick</surname><given-names>SR</given-names></name><etal/></person-group><article-title>A deep sequencing approach to estimate <italic>Plasmodium falciparum</italic> complexity of infection (COI) and explore apical membrane antigen 1 diversity</article-title><source>Malar J</source><year>2017</year><volume>16</volume><fpage>490</fpage><pub-id pub-id-type="pmid">29246158</pub-id></element-citation></ref><ref id="CR16"><label>16.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Lerch</surname><given-names>A</given-names></name><name><surname>Koepfli</surname><given-names>C</given-names></name><name><surname>Hofmann</surname><given-names>NE</given-names></name><name><surname>Kattenberg</surname><given-names>JH</given-names></name><name><surname>Rosanas-Urgell</surname><given-names>A</given-names></name><name><surname>Betuela</surname><given-names>I</given-names></name><etal/></person-group><article-title>Longitudinal tracking and quantification of individual <italic>Plasmodium falciparum</italic> clones in complex infections</article-title><source>Sci Rep</source><year>2019</year><volume>9</volume><fpage>3333</fpage><pub-id pub-id-type="pmid">30833657</pub-id></element-citation></ref><ref id="CR17"><label>17.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mustonen</surname><given-names>L</given-names></name><name><surname>Gao</surname><given-names>X</given-names></name><name><surname>Santana</surname><given-names>A</given-names></name><name><surname>Mitchell</surname><given-names>R</given-names></name><name><surname>Vigfusson</surname><given-names>Y</given-names></name><name><surname>Ruthotto</surname><given-names>L</given-names></name></person-group><article-title>A Bayesian framework for molecular strain identification from mixed diagnostic samples</article-title><source>Inverse Probl</source><year>2018</year><volume>34</volume><fpage>105009</fpage></element-citation></ref><ref id="CR18"><label>18.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname><given-names>SJ</given-names></name><name><surname>Almagro-Garcia</surname><given-names>J</given-names></name><name><surname>McVean</surname><given-names>G</given-names></name></person-group><article-title>Deconvolution of multiple infections in <italic>Plasmodium falciparum</italic> from high throughput sequencing 
data</article-title><source>Bioinformatics</source><year>2018</year><volume>34</volume><fpage>9</fpage><lpage>15</lpage><pub-id pub-id-type="pmid">28961721</pub-id></element-citation></ref><ref id="CR19"><label>19.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Daniels</surname><given-names>R</given-names></name><name><surname>Volkman</surname><given-names>SK</given-names></name><name><surname>Milner</surname><given-names>DA</given-names></name><name><surname>Mahesh</surname><given-names>N</given-names></name><name><surname>Neafsey</surname><given-names>DE</given-names></name><name><surname>Park</surname><given-names>DJ</given-names></name></person-group><article-title>A general SNP-based molecular barcode for <italic>Plasmodium falciparum</italic> identification and tracking</article-title><source>Malar J</source><year>2008</year><volume>7</volume><fpage>223</fpage><pub-id pub-id-type="pmid">18959790</pub-id></element-citation></ref><ref id="CR20"><label>20.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Daniels</surname><given-names>R</given-names></name><name><surname>Chang</surname><given-names>HH</given-names></name><name><surname>Sene</surname><given-names>PD</given-names></name><name><surname>Park</surname><given-names>DC</given-names></name><name><surname>Neafsey</surname><given-names>DE</given-names></name><name><surname>Schaffner</surname><given-names>SF</given-names></name><etal/></person-group><article-title>Genetic surveillance detects both clonal and epidemic transmission of malaria following enhanced intervention in Senegal</article-title><source>PLoS ONE</source><year>2013</year><volume>8</volume><fpage>e60780</fpage><pub-id pub-id-type="pmid">23593309</pub-id></element-citation></ref><ref id="CR21"><label>21.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Galinsky</surname><given-names>K</given-names></name><name><surname>Valim</surname><given-names>C</given-names></name><name><surname>Salmier</surname><given-names>A</given-names></name><name><surname>Thoisy</surname><given-names>B</given-names></name><name><surname>Musset</surname><given-names>L</given-names></name><name><surname>Legrand</surname><given-names>E</given-names></name></person-group><article-title>COIL: a methodology for evaluating malarial complexity of infection using likelihood from single nucleotide polymorphism data</article-title><source>Malar J</source><year>2015</year><volume>14</volume><fpage>4</fpage><pub-id pub-id-type="pmid">25599890</pub-id></element-citation></ref><ref id="CR22"><label>22.</label><mixed-citation publication-type="other">Illumina. <ext-link ext-link-type="uri" xlink:href="https://support.illumina.com/downloads/16s_metagenomic_sequencing_library_preparation.html">https://support.illumina.com/downloads/16s_metagenomic_sequencing_library_preparation.html</ext-link>. 
2016.</mixed-citation></ref><ref id="CR23"><label>23.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gatei</surname><given-names>W</given-names></name><name><surname>Gimnig</surname><given-names>JE</given-names></name><name><surname>Hawley</surname><given-names>W</given-names></name><name><surname>ter Kuile</surname><given-names>F</given-names></name><name><surname>Odero</surname><given-names>C</given-names></name><name><surname>Iriemenam</surname><given-names>NC</given-names></name><etal/></person-group><article-title>Genetic diversity of <italic>Plasmodium falciparum</italic> parasite by microsatellite markers after scale-up of insecticide-treated bed nets in western Kenya</article-title><source>Malar J</source><year>2015</year><volume>13</volume><fpage>495</fpage><pub-id pub-id-type="pmid">26651480</pub-id></element-citation></ref><ref id="CR24"><label>24.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Phillips-Howard</surname><given-names>PA</given-names></name><name><surname>Nahlen</surname><given-names>B</given-names></name><name><surname>Kolczak</surname><given-names>MS</given-names></name><name><surname>Hightower</surname><given-names>AW</given-names></name><name><surname>ter Kuile</surname><given-names>FO</given-names></name><name><surname>Alaii</surname><given-names>JA</given-names></name><etal/></person-group><article-title>Efficacy of permethrin-treated bed nets in the prevention of mortality in young children in an area of high perennial malaria transmission in western Kenya</article-title><source>Am J Trop Med Hyg</source><year>2003</year><volume>68</volume><fpage>23</fpage><lpage>29</lpage><pub-id pub-id-type="pmid">12749482</pub-id></element-citation></ref><ref id="CR25"><label>25.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Gatei</surname><given-names>W</given-names></name><name><surname>Kariuki</surname><given-names>S</given-names></name><name><surname>Hawley</surname><given-names>W</given-names></name><name><surname>ter Kuile</surname><given-names>F</given-names></name><name><surname>Terlouw</surname><given-names>D</given-names></name><name><surname>Phillips-Howard</surname><given-names>P</given-names></name><etal/></person-group><article-title>Effects of transmission reduction by insecticide-treated bed nets (ITNs) on parasite genetics population structure: I. The genetic diversity of <italic>Plasmodium falciparum</italic> parasites by microsatellite markers in western Kenya</article-title><source>Malar J</source><year>2010</year><volume>9</volume><fpage>353</fpage><pub-id pub-id-type="pmid">21134282</pub-id></element-citation></ref><ref id="CR26"><label>26.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sisya</surname><given-names>TJ</given-names></name><name><surname>Kamn&#x02019;gona</surname><given-names>RM</given-names></name><name><surname>Vareta</surname><given-names>JA</given-names></name><name><surname>Fulakeza</surname><given-names>JM</given-names></name><name><surname>Mukaka</surname><given-names>MFJ</given-names></name><name><surname>Seydel</surname><given-names>KB</given-names></name><etal/></person-group><article-title>Subtle changes in <italic>Plasmodium falciparum</italic> infection complexity following enhanced intervention in Malawi</article-title><source>Acta Trop</source><year>2015</year><volume>142</volume><fpage>108</fpage><lpage>114</lpage><pub-id pub-id-type="pmid">25460345</pub-id></element-citation></ref><ref id="CR27"><label>27.</label><mixed-citation publication-type="other">StrainPycon. 
<ext-link ext-link-type="uri" xlink:href="https://www.ymsir.com/strainpycon">https://www.ymsir.com/strainpycon</ext-link>.</mixed-citation></ref><ref id="CR28"><label>28.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Benjamini</surname><given-names>Y</given-names></name><name><surname>Krieger</surname><given-names>AM</given-names></name><name><surname>Yekutieli</surname><given-names>D</given-names></name></person-group><article-title>Adaptive linear step-up procedures that control the false discovery rate</article-title><source>Biometrika</source><year>2006</year><volume>93</volume><fpage>491</fpage><lpage>507</lpage></element-citation></ref><ref id="CR29"><label>29.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Strommenger</surname><given-names>B</given-names></name><name><surname>Kettlitz</surname><given-names>C</given-names></name><name><surname>Werner</surname><given-names>G</given-names></name><name><surname>Witte</surname><given-names>W</given-names></name></person-group><article-title>Multiplex PCR assay for simultaneous detection of nine clinically relevant antibiotic resistance genes in <italic>Staphylococcus aureus</italic></article-title><source>J Clin Microbiol</source><year>2003</year><volume>41</volume><fpage>4089</fpage><lpage>4094</lpage><pub-id pub-id-type="pmid">12958230</pub-id></element-citation></ref><ref id="CR30"><label>30.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Lau</surname><given-names>YL</given-names></name><name><surname>Lai</surname><given-names>MY</given-names></name><name><surname>Anthony</surname><given-names>CN</given-names></name><name><surname>Chang</surname><given-names>PY</given-names></name><name><surname>Palaeya</surname><given-names>V</given-names></name><name><surname>Fong</surname><given-names>MY</given-names></name><etal/></person-group><article-title>Comparison of three molecular methods for the detection and speciation of five human <italic>Plasmodium</italic> species</article-title><source>Am J Trop Med Hyg</source><year>2015</year><volume>92</volume><fpage>28</fpage><lpage>33</lpage><pub-id pub-id-type="pmid">25385862</pub-id></element-citation></ref><ref id="CR31"><label>31.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Koepfli</surname><given-names>C</given-names></name><name><surname>Schoepflin</surname><given-names>S</given-names></name><name><surname>Bretscher</surname><given-names>M</given-names></name><name><surname>Lin</surname><given-names>E</given-names></name><name><surname>Kiniboro</surname><given-names>B</given-names></name><name><surname>Zimmerman</surname><given-names>PA</given-names></name><etal/></person-group><article-title>How much remains undetected? 
Probability of molecular detection of human Plasmodia in the field</article-title><source>PLoS ONE</source><year>2011</year><volume>6</volume><fpage>e19010</fpage><pub-id pub-id-type="pmid">21552561</pub-id></element-citation></ref><ref id="CR32"><label>32.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Shendure</surname><given-names>J</given-names></name><name><surname>Ji</surname><given-names>H</given-names></name></person-group><article-title>Next-generation DNA sequencing</article-title><source>Nat Biotechnol</source><year>2008</year><volume>26</volume><fpage>1135</fpage><lpage>1145</lpage><pub-id pub-id-type="pmid">18846087</pub-id></element-citation></ref><ref id="CR33"><label>33.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Flaherty</surname><given-names>P</given-names></name><name><surname>Natsoulis</surname><given-names>G</given-names></name><name><surname>Muralidharan</surname><given-names>O</given-names></name><name><surname>Winters</surname><given-names>M</given-names></name><name><surname>Buenrostro</surname><given-names>J</given-names></name><name><surname>Bell</surname><given-names>J</given-names></name><etal/></person-group><article-title>Ultrasensitive detection of rare mutations using next-generation targeted resequencing</article-title><source>Nucleic Acids Res</source><year>2012</year><volume>40</volume><fpage>e2</fpage><pub-id pub-id-type="pmid">22013163</pub-id></element-citation></ref><ref id="CR34"><label>34.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Tsiatis</surname><given-names>AC</given-names></name><name><surname>Norris-Kirby</surname><given-names>A</given-names></name><name><surname>Rich</surname><given-names>RG</given-names></name><name><surname>Hafez</surname><given-names>MJ</given-names></name><name><surname>Gocke</surname><given-names>CD</given-names></name><name><surname>Eshleman</surname><given-names>JR</given-names></name><etal/></person-group><article-title>Comparison of Sanger sequencing, pyrosequencing, and melting curve analysis for the detection of KRAS mutations: diagnostic and clinical implications</article-title><source>J Mol Diagn</source><year>2010</year><volume>12</volume><fpage>425</fpage><lpage>432</lpage><pub-id pub-id-type="pmid">20431034</pub-id></element-citation></ref><ref id="CR35"><label>35.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Messerli</surname><given-names>C</given-names></name><name><surname>Hofmann</surname><given-names>NE</given-names></name><name><surname>Beck</surname><given-names>H-P</given-names></name><name><surname>Felger</surname><given-names>I</given-names></name></person-group><article-title>Critical evaluation of molecular monitoring in malaria drug efficacy trials and pitfalls of length-polymorphic markers</article-title><source>Antimicrob Agents Chemother</source><year>2016</year><volume>61</volume><fpage>e01500-16</fpage><pub-id pub-id-type="pmid">27821442</pub-id></element-citation></ref><ref id="CR36"><label>36.</label><element-citation publication-type="journal"><person-group 
person-group-type="author"><name><surname>Greenhouse</surname><given-names>B</given-names></name><name><surname>Myrick</surname><given-names>A</given-names></name><name><surname>Dokomajilar</surname><given-names>C</given-names></name><name><surname>Woo</surname><given-names>JM</given-names></name><name><surname>Carlson</surname><given-names>EJ</given-names></name><name><surname>Rosenthal</surname><given-names>PJ</given-names></name></person-group><article-title>Validation of microsatellite markers for use in genotyping polyclonal <italic>Plasmodium falciparum</italic> infections</article-title><source>Am J Trop Med Hyg</source><year>2006</year><volume>75</volume><fpage>836</fpage><lpage>842</lpage><pub-id pub-id-type="pmid">17123974</pub-id></element-citation></ref><ref id="CR37"><label>37.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gruenberg</surname><given-names>M</given-names></name><name><surname>Lerch</surname><given-names>A</given-names></name><name><surname>Beck</surname><given-names>H-P</given-names></name><name><surname>Felger</surname><given-names>I</given-names></name></person-group><article-title>Amplicon deep sequencing improves <italic>Plasmodium falciparum</italic> genotyping in clinical trials of antimalarial drugs</article-title><source>Sci Rep</source><year>2019</year><volume>9</volume><fpage>17790</fpage><pub-id pub-id-type="pmid">31780741</pub-id></element-citation></ref><ref id="CR38"><label>38.</label><element-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ter 
Kuile</surname><given-names>FO</given-names></name><name><surname>Terlouw</surname><given-names>DJ</given-names></name><name><surname>Phillips-Howard</surname><given-names>PA</given-names></name><name><surname>Hawley</surname><given-names>WA</given-names></name><name><surname>Friedman</surname><given-names>JF</given-names></name><name><surname>Kolczak</surname><given-names>MS</given-names></name><etal/></person-group><article-title>Impact of permethrin-treated bed nets on malaria and all-cause morbidity in young children in an area of intense perennial malaria transmission in western Kenya: cross-sectional survey</article-title><source>Am J Trop Med Hyg</source><year>2003</year><volume>68</volume><issue>Suppl 4</issue><fpage>100</fpage><lpage>107</lpage><pub-id pub-id-type="pmid">12749492</pub-id></element-citation></ref></ref-list></back></article>