<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">Online J Public Health Inform</journal-id>
      <journal-title>Online Journal of Public Health Informatics</journal-title>
      <issn pub-type="epub">1947-2579</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v16i1e50771</article-id>
      <article-id pub-id-type="pmid">38625737</article-id>
      <article-id pub-id-type="doi">10.2196/50771</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Machine Learning for Prediction of Tuberculosis Detection: Case Study of Trained African Giant Pouched Rats</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Leung</surname>
            <given-names>Tiffany</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Mensah</surname>
            <given-names>Edward</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Best</surname>
            <given-names>Eric</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Schultz</surname>
            <given-names>Jörg</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Perepu</surname>
            <given-names>Sireesha</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Jonathan</surname>
            <given-names>Joan</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Informatics and Information Technology</institution>
            <institution>Sokoine University of Agriculture</institution>
            <addr-line>PO Box 3038</addr-line>
            <addr-line>Morogoro</addr-line>
            <country>United Republic of Tanzania</country>
            <phone>255 763 630 054</phone>
            <email>joanjonathan@sua.ac.tz</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4365-7067</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Barakabitze</surname>
            <given-names>Alcardo Alex</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8960-8415</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Fast</surname>
            <given-names>Cynthia D</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3764-565X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Cox</surname>
            <given-names>Christophe</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-3118-4491</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Informatics and Information Technology</institution>
        <institution>Sokoine University of Agriculture</institution>
        <addr-line>Morogoro</addr-line>
        <country>United Republic of Tanzania</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>APOPO Rodent Project</institution>
        <institution>Sokoine University of Agriculture</institution>
        <addr-line>Morogoro</addr-line>
        <country>United Republic of Tanzania</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Evolutionary Ecology Group, Department of Biology</institution>
        <institution>University of Antwerp</institution>
        <addr-line>Antwerp</addr-line>
        <country>Belgium</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Rutgers Center for Cognitive Science</institution>
        <addr-line>Piscataway, NJ</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Joan Jonathan <email>joanjonathan@sua.ac.tz</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>16</day>
        <month>4</month>
        <year>2024</year>
      </pub-date>
      <volume>16</volume>
      <elocation-id>e50771</elocation-id>
      <history>
        <date date-type="received">
          <day>12</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>7</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>27</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>15</day>
          <month>3</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Joan Jonathan, Alcardo Alex Barakabitze, Cynthia D Fast, Christophe Cox. Originally published in the Online Journal of Public Health Informatics (https://ojphi.jmir.org/), 16.04.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Online Journal of Public Health Informatics, is properly cited. The complete bibliographic information, a link to the original publication on https://ojphi.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ojphi.jmir.org/2024/1/e50771" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Technological advancement has led to the growth and rapid increase of tuberculosis (TB) medical data generated from different health care areas, including diagnosis. Prioritizing better adoption and acceptance of innovative diagnostic technology to reduce the spread of TB significantly benefits developing countries. Trained TB-detection rats are used in Tanzania and Ethiopia for operational research to complement other TB diagnostic tools. This technology has increased new TB case detection owing to its speed, cost-effectiveness, and sensitivity.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>During the TB detection process, rats produce vast amounts of data, providing an opportunity to identify interesting patterns that influence TB detection performance. This study aimed to develop models that predict if the rat will hit (indicate the presence of TB within) the sample or not using machine learning (ML) techniques. The goal was to improve the diagnostic accuracy and performance of TB detection involving rats.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>APOPO (Anti-Persoonsmijnen Ontmijnende Product Ontwikkeling) Center in Morogoro provided data for this study from 2012 to 2019, and 366,441 observations were used to build predictive models using ML techniques, including decision tree, random forest, naïve Bayes, support vector machine, and k-nearest neighbor, by incorporating a variety of variables, such as the diagnostic results from partner health clinics using methods endorsed by the World Health Organization (WHO).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The support vector machine technique yielded the highest accuracy of 83.39% for prediction compared to other ML techniques used. Furthermore, this study found that the inclusion of variables related to whether the sample contained TB or not increased the performance accuracy of the predictive model.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The inclusion of variables related to the diagnostic results of TB samples may improve the detection performance of the trained rats. The study results may be of importance to TB-detection rat trainers and TB decision-makers as the results may prompt them to take action to maintain the usefulness of the technology and increase the TB detection performance of trained rats.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>machine learning</kwd>
        <kwd>African giant pouched rat</kwd>
        <kwd>diagnosis</kwd>
        <kwd>tuberculosis</kwd>
        <kwd>health care</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>African giant pouched rats (<italic>Cricetomys ansorgei</italic>) are native to sub-Saharan Africa, making them resistant to local parasites and diseases [<xref ref-type="bibr" rid="ref1">1</xref>]. The term “pouched rat” refers to their large cheek pouches that are used for carrying food back to their burrows, where the food is either eaten or stored. These rats are nocturnal and omnivorous, eating various insects, fruits, and vegetables. They are large (adult males and females weigh about 1.3 kg and 1.2 kg, respectively) and are long-lived, averaging 8 years in captivity. Moreover, they have a highly developed olfactory capacity, enabling them to do specific detection tasks with training [<xref ref-type="bibr" rid="ref2">2</xref>]. As such, in 1997, APOPO (Anti-Persoonsmijnen Ontmijnende Product Ontwikkeling or “Anti-Personnel Landmines Detection Product Development” in English) started researching how to train these rats for scent detection. APOPO is a Belgian nongovernmental organization whose mission is to protect people and the planet using scent detection animals [<xref ref-type="bibr" rid="ref3">3</xref>]. Rat pups born at APOPO’s breeding facility are weaned from their mother at 10 weeks old. Rats begin training in a custom-engineered line cage immediately after they are weaned. Training for tuberculosis (TB) detection takes place in this apparatus, which requires upwards of 9 months to master. Each rat’s home cage is outfitted with a clay nest pot to simulate the rat’s natural underground burrow, a wood shaving substrate, and unlimited access to water that is routinely infused with a multivitamin and electrolyte supplement. The majority of the diet of the rats is provided during training sessions in the form of crushed commercial rodent chow pellets mixed with mashed bananas and avocados, which serves as appetitive reinforcement for the operant conditioning procedures. This diet is supplemented with a variety of fresh fruits, vegetables, and grains [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        <p>While APOPO began with training rats to detect landmines in former conflict zones, the demonstrated success influenced the 2001 idea to also train the rats to detect the presence of <italic>Mycobacterium tuberculosis</italic> in human sputum samples [<xref ref-type="bibr" rid="ref4">4</xref>]. Data reported annually to the World Health Organization (WHO) by countries show that TB is one of the major causes of ill health and death worldwide. TB is a life-threatening infectious disease that attacks the lungs and can also harm other parts of the body. The transmission occurs from one person to another when a person with TB talks, sneezes, or coughs. The development of novel, accurate, robust, and rapid diagnostic capabilities will result in improved case detection, disease surveillance, health care delivery, and quality of future research [<xref ref-type="bibr" rid="ref5">5</xref>]. In 2004, APOPO and Sokoine University of Agriculture (SUA) partnered with the Tanzanian National Institute of Medical Research (NIMR) and the Tanzanian National Tuberculosis and Leprosy Program (NTLP) to develop a scent-detection technology for diagnosing human TB in resource-poor areas [<xref ref-type="bibr" rid="ref6">6</xref>]. While microscopy is the most commonly used method to detect TB in developing countries, its effectiveness remains a problem [<xref ref-type="bibr" rid="ref3">3</xref>]. In Tanzania, the Ministry of Health, Community Development, Gender, Elders, and Children (MOHCDGEC) permitted APOPO to conduct research using rats to detect TB bacteria in sputum samples [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates the concept of rat scent detection of TB. Sputum samples collected from partner DOTS (directly-observed treatment, short-course) clinics are heat inactivated (autoclaved) and then loaded into aluminum bars, which are positioned beneath holes in the floor of the line cage apparatus [<xref ref-type="bibr" rid="ref4">4</xref>]. The rat sniffs each sample in succession as it walks from one side of the apparatus to the other. The rats are trained to pause over TB-positive samples for about 3 seconds but to quickly move past TB-negative samples [<xref ref-type="bibr" rid="ref1">1</xref>]. During operational research, rats are rewarded with food for correctly pausing over (or “indicating”) samples that the DOTS clinic has determined to be TB positive. Samples which the DOTS clinic determines to be TB negative but which the rat indicates as TB positive (by pausing for 3 seconds) are flagged as suspect and subjected to additional confirmatory diagnostics in APOPO’s laboratory, using WHO-endorsed methods (typically, concentrated smear microscopy). During routine operational research, APOPO’s scent detection rats evaluate upwards of 100 samples (averaging 10% TB positive) from DOTS clinics within each 20-minute session. Referencing sample and patient information within a secure database allows APOPO to immediately notify the DOTS clinic of new cases so the patient can be contacted and can begin treatment. This procedure has effectively identified more than 29,000 TB patients who had a missed diagnosis prior to evaluation by TB-detection rats [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Tuberculosis (TB) testing and detection using trained rats. The rats test and detect TB-negative and TB-positive samples.</p>
          </caption>
          <graphic xlink:href="ojphi_v16i1e50771_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Conceptual Framework</title>
        <p>The theoretical concepts and empirical framework of this study are based on Signal Detection Theory (SDT). SDT describes how features of the stimulus and detector factors affect performance on stimulus detection tasks [<xref ref-type="bibr" rid="ref8">8</xref>]. SDT helps to distinguish between the sensitivity of a detector and the underlying signal. In medical diagnosis, this translates to the efficacy of a diagnostic tool to accurately detect the presence of a pathogen or other signal with medical significance [<xref ref-type="bibr" rid="ref9">9</xref>], that is, the diagnostic “sensitivity.” However, in rats, determining diagnostic accuracy depends on the rat’s training and the diagnostic results from partner health clinics using WHO-endorsed methods. During training, the behavior of each rat is recorded, including indication responses committed in response to samples known to either contain or not contain TB (TB positive or TB negative). These data allow trainers to accurately track each rat’s discrimination learning [<xref ref-type="bibr" rid="ref4">4</xref>]. There are numerous independent variables related to each rat evaluation session, including the rat’s identity (name), age, sex, and bodyweight, as well as the characteristics of the sample itself, including DOTS clinic diagnostic results (ID_BL_DOTS) and results of any applicable confirmatory diagnosis within APOPO’s laboratory (ID_BL_APOPO), which are combined to form another independent variable called TB_Status.</p>
        <p>In this study, one of the primary dependent variables was captured as hit, which refers to whether or not (true or false) the rat provided an indication (continuously sniffed the sample for at least 3 seconds, as estimated by the rat handler). Combining the hit variable with WHO-endorsed diagnostic results (ID_BL_DOTS and ID_BL_APOPO) provided 4 possible outcomes termed rat performance for each sample evaluated (<xref rid="figure2" ref-type="fig">Figure 2</xref>), including correct hit, miss, false alarm, and correct reject, which are used in determining the diagnostic accuracy of each rat. Correct hit refers to samples that the rat indicated and were confirmed to contain TB; false alarm (or suspect) refers to samples that the rat indicated but which could not be confirmed to contain TB. Additionally, miss (sample confirmed to be TB positive) and correct reject (no TB mycobacterium confirmed) refer to samples that the rat failed to indicate (sniff for 3 seconds) [<xref ref-type="bibr" rid="ref3">3</xref>]. In other words, the rat’s sensitivity represents the percentage of correct hits out of the sum of total correct hits and total misses (all confirmed TB-positive samples evaluated by the rat). Similarly, the rat’s specificity represents the percentage of correct rejects out of the sum of correct rejects and false alarms (all samples found to be TB negative) [<xref ref-type="bibr" rid="ref10">10</xref>]. By this logic, sensitivity refers to the rat’s ability to accurately find true positive (TP) cases, while specificity measures its ability to accurately reject negative cases. Hence, sensitivity (correct hit) and specificity (correct reject) together comprise overall diagnostic accuracy.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Relationship among the status of tuberculosis (TB), hit, and the performance of the rat. Hit refers to whether or not (true or false) the rat provided an indication.</p>
          </caption>
          <graphic xlink:href="ojphi_v16i1e50771_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>From <xref rid="figure2" ref-type="fig">Figure 2</xref>, if the TB status was already known to be positive at the time of the rat evaluation and hit was true, the rat’s behavior was categorized as “correct hit.” Conversely, if the TB status was positive and hit was false, the rat’s behavior was categorized as “miss.” On the other hand, if the TB status was determined to be negative at the time of the rat evaluation and hit was true, the rat’s behavior was categorized as “false alarm” or suspect. Finally, if the TB status was negative and hit was false, the rat’s behavior was categorized as “correct reject.”</p>
        <p>Hence, contrary to the study by Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>], this study considered the status of TB in the sample the rat was evaluating. In that study, the modeling methods only used the dichotomous variable of hit as true or false (ie, did the rat sniff the sample for ≥3 seconds) without regard for what the rat was sniffing. Within the data set analyzed, about 78.8% of samples were not hit (hit=false), somewhat reflecting the estimated underlying prevalence of TB across the samples. However, assuming this distribution reflected that the most common outcome (hit=false) served as the desired or correct outcome in all instances when modeling rat performance, the models predicted when a trained rat would fail to detect TB (ie, miss a TB-positive sample or correctly reject a TB-negative sample) rather than detect it. Furthermore, the predictive power of the models did not take into account what the rats were smelling, since the rats were trained to perform differently (hit true or false) depending on the presence of TB within the sample. Therefore, the aim of this study was to replicate the procedures of the study by Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>] but with the inclusion of variables related to the detection of TB and with expansion of modeling to include 2 additional machine learning (ML) algorithms.</p>
      </sec>
      <sec>
        <title>Objectives of the Study</title>
        <p>This study applied the same data set from APOPO’s TB-detection rat training and research center in Morogoro, Tanzania, as used by Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>] but with the inclusion of WHO-endorsed diagnostic results, including those provided by partner DOTS clinics (smear microscopy, ID_BL_DOTS) and, where applicable, those performed by APOPO (either concentrated smear microscopy, ID_BL_APOPO, or fluorescent microscopy, ID_BL_FM) to confirm samples flagged suspect by the rats. As with Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>], this study used the decision tree, random forest, and naïve Bayes algorithms and included support vector machine (SVM) and k-nearest neighbor (kNN) ML techniques to improve the accuracy of the predictive models. Furthermore, it provides extensive simulations using real data to determine if ML techniques can accurately predict the performance of rat TB detection. Additionally, this paper compares the classification accuracy performance of the 5 ML predictive models. The rest of this paper is organized as follows: the Related Work subsection provides details of related literature focusing on African giant pouched rat TB detection, including the current status and its implications, along with the application of ML in diagnosing and detecting TB; the Methods section presents the methodology of this study; the Results section provides a description of the performance results and performance measurements of the predictive models; and the Discussion section discusses the findings, provides conclusions, and mentions the scope for future work.</p>
      </sec>
      <sec>
        <title>Related Work</title>
        <sec>
          <title>Diagnosis of TB by African Giant Pouched Rats: Current Status and its Implications</title>
          <p>African giant pouched rats cost-efficiently complement other TB diagnostic tools through second-line screening via scent detection to increase TB case detection. Patient samples are provided by partner DOTS clinics that perform initial screening. The rats can test up to 100 samples in 20 minutes or less, while a laboratory technician requires about 4 days to accomplish the same task using microscopy [<xref ref-type="bibr" rid="ref11">11</xref>]. Samples that the clinic deems TB negative but which the rats indicate are TB positive are then retested using WHO-endorsed methods, such as concentrated smear microscopy or GeneXpert. Samples that are confirmed positive are communicated to the respective DOTS clinic, effectively providing 24-hour result turnaround and improved linkage to care [<xref ref-type="bibr" rid="ref6">6</xref>]. Applying this method since 2007 has enabled TB-detection rats to identify more than 29,000 patients who had a missed diagnosis during initial screening [<xref ref-type="bibr" rid="ref4">4</xref>]. Thus, rat scent detection technology is of great importance to the community and public health hospitals because it increases case detection, enables treatment, and curbs the spread of the disease [<xref ref-type="bibr" rid="ref3">3</xref>].</p>
        </sec>
        <sec>
          <title>Application of ML and Big Data Analytics in Diagnosing and Detecting TB</title>
          <p>Technology advancement has allowed access to data from multidimensional sources with high throughput velocity. The term used to describe this kind of data is “big data,” which is difficult to analyze for interesting patterns or inefficiencies without ML technologies [<xref ref-type="bibr" rid="ref12">12</xref>]. The application of ML in health care is important to improve human health, and ML and big data analytic technologies have brought advancements in TB health care services owing to the increase of health care data and the availability of analytics to solve health problems [<xref ref-type="bibr" rid="ref13">13</xref>]. ML is a technology that enables a machine to learn from past data and predict the outcome. Thus, in health care, ML contains sophisticated algorithms that help to learn features from a large volume of health care data and then use the obtained insights to assist clinical practices [<xref ref-type="bibr" rid="ref14">14</xref>]. Big data analytics is the use of advanced analytic techniques on vast amounts of data in different formats, such as structured, semistructured, and unstructured data, from different sources. Big data analytics can help to discover useful information that facilitates decision-making and health care outcome prediction. Therefore, ML and big data analytics can assist physicians by providing up-to-date medical information from clinical practices for proper patient care. As such, the application of ML and big data analytics can help to reduce diagnostic and human errors in the outcomes of clinical practices [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
          <p>ML in health care depends on different techniques, which include classification, clustering, and association, for its operation. These techniques help to learn past data and detect knowledge patterns [<xref ref-type="bibr" rid="ref16">16</xref>]. Classification techniques are used to develop models that predict future events from the manipulated data and offer solutions to real-world health problems such as diagnosis and treatment of diseases [<xref ref-type="bibr" rid="ref16">16</xref>]. Classification is the ML technique that operates by building predictive models that categorize and assign labels to manipulated and newly encountered instances [<xref ref-type="bibr" rid="ref16">16</xref>]. These predictive models help solve multiclassification problems through prediction and analysis. Moreover, the models are used as decision-support tools that help medical professionals interpret diagnosis results [<xref ref-type="bibr" rid="ref17">17</xref>]. For example, Abdar et al [<xref ref-type="bibr" rid="ref18">18</xref>] used the boosted C5.0 and CHAID classification algorithms to build a decision tree model for the early diagnosis and prediction of liver disease. In addition, ML technologies were used in the diagnosis of TB to categorize and find relationships among the manipulated variables [<xref ref-type="bibr" rid="ref19">19</xref>]. That study developed an efficient and reliable framework for automatic TB bacilli detection based on deep learning and ML algorithms. The study also suggested that a classification model can be used to discriminate between positive and negative samples [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
          <p>The classification algorithms recently used in the diagnosis of TB include decision tree, random forest, naïve Bayes, SVM, and kNN [<xref ref-type="bibr" rid="ref20">20</xref>]. These algorithms are suggested as an alternative for health care professionals to improve the diagnosis of TB. The decision tree algorithm C4.5 was used to build a model to predict the presence of TB bacteria. The results showed that the decision tree had a prediction accuracy of 99% [<xref ref-type="bibr" rid="ref21">21</xref>]. The decision tree generates rules that are simple and easy to understand and interpret for a decision maker [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
          <p>Moreover, a random forest classification algorithm was used to discriminate the TB bacilli with sensitivity and specificity above 89.34% and 62.89%, respectively. Furthermore, it is proposed that the naïve Bayes algorithm can be used for the diagnosis of TB [<xref ref-type="bibr" rid="ref22">22</xref>]. Additionally, SVM is known as a useful model to identify abnormalities in the lungs for the diagnosis of TB [<xref ref-type="bibr" rid="ref23">23</xref>]. Following this, algorithm comparison is of great importance to find a reliable algorithm in the given data [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>ML Algorithms</title>
        <p>In this study, the decision tree, random forest, naïve Bayes, SVM, and kNN ML algorithms were used to build predictive models that categorize data and assign a label to manipulated and newly encountered data. The purpose of involving different algorithms was to compare and improve the prediction accuracy of rats for TB detection.</p>
      </sec>
      <sec>
        <title>Real Data Sets</title>
        <p>This paper used 2 data sets provided by APOPO: detection rats data set and RAT_WEIGHT data set, which were combined to form the final data set, as shown in <xref ref-type="table" rid="table1">Table 1</xref>. The detection rats data set contained 471,133 observations from 2011 to 2019 and involved 18 variables (17 independent and 1 dependent). The RAT_WEIGHT data set contained 1438 records collected from 2012 to 2019 and involved 4 independent variables. Moreover, these data contained 5 female rats with IDs 56, 72, 80, 85, and 96. However, the fifth rat with ID 96 from the RAT_WEIGHT data set was eliminated in the analysis because it lacked the necessary detection performance variables in the detection rats data set. Therefore, 4 female rats were used in this study. The 2 data sets and corresponding variables are displayed in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Rats data set description.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="120"/>
            <col width="240"/>
            <col width="120"/>
            <col width="380"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Data set and number</td>
                <td>Variable name</td>
                <td>Data type</td>
                <td>Description</td>
                <td>Variable type</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Detection rats data set</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1</td>
                <td>DOTS_NAME</td>
                <td>String</td>
                <td>Name of the DOTS<sup>a</sup> center</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2</td>
                <td>DOTS_PATIENTS_NUMBER</td>
                <td>Integer</td>
                <td>Number of patients from the DOTS center</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3</td>
                <td>ENTRY_YEAR</td>
                <td>Integer</td>
                <td>Year when the patient attended the DOTS center</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>4</td>
                <td>ID_SAMPLE</td>
                <td>Integer</td>
                <td>Identification of the sample</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>5</td>
                <td>ID_BL_DOTS</td>
                <td>Integer</td>
                <td>Identification of the bacteria level from the DOTS center</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>6</td>
                <td>HIT</td>
                <td>Boolean</td>
                <td>TB<sup>b</sup> detection rat performance (categorical variable)</td>
                <td>Dependent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>7</td>
                <td>ID_BL_APOPO</td>
                <td>Integer</td>
                <td>Identification of the bacteria level from the APOPO<sup>c</sup> center</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>8</td>
                <td>ID_CONFIGURATION</td>
                <td>Integer</td>
                <td>Identification of the cage during training</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>9</td>
                <td>ID_BL_FM</td>
                <td>Integer</td>
                <td>Identification of the bacteria level by fluorescence microscopy</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>10</td>
                <td>ID_EVALUATION_SESSION</td>
                <td>Integer</td>
                <td>Identification of the evaluation session</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>11</td>
                <td>SESSION_DATE</td>
                <td>Date</td>
                <td>Date when a session was performed</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>12</td>
                <td>ID_RAT</td>
                <td>Integer</td>
                <td>Identification of the rat</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>13</td>
                <td>RAT_NAME</td>
                <td>String</td>
                <td>Name of the rat</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>14</td>
                <td>GENDER</td>
                <td>String</td>
                <td>Sex of the rat</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>15</td>
                <td>AGE</td>
                <td>Integer</td>
                <td>Age of the rat</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>16</td>
                <td>START_TIME</td>
                <td>DateTime</td>
                <td>Date and time when the detection task started</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>17</td>
                <td>END_TIME</td>
                <td>DateTime</td>
                <td>Date and time when the detection task ended</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>18</td>
                <td>DOB</td>
                <td>Date</td>
                <td>Date when the rat was born</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>RAT_WEIGHT data set</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>1</td>
                <td>ID_RAT</td>
                <td>Integer</td>
                <td>Identification of the rat</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>2</td>
                <td>RAT_NAME</td>
                <td>String</td>
                <td>Name of the rat</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>3</td>
                <td>WEIGHT_DATE</td>
                <td>Date</td>
                <td>Date when the weight of the rat was measured</td>
                <td>Independent</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>4</td>
                <td>WEIGHT</td>
                <td>Integer</td>
                <td>Weight of the rat</td>
                <td>Independent</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>DOTS: directly-observed treatment, short-course.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>TB: tuberculosis.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>APOPO: Anti-Persoonsmijnen Ontmijnende Product Ontwikkeling.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Applied Variables</title>
        <p>The data underwent initial preprocessing to obtain the required variables for developing the predictive models. All data preparation was implemented by Python owing to its large number of libraries for scientific computing and the development of ML predictive models [<xref ref-type="bibr" rid="ref24">24</xref>]. The sample (either TB negative or the bacterial concentration of TB positivity provided by the partner DOTS clinic, ID_BL_DOTS) was compared to APOPO’s confirmatory diagnosis (where applicable) using concentrated smear microscopy (ID_BL_APOPO) to create a variable termed Definitive_Status. This variable reflected the APOPO result when one was provided; otherwise, it indicated the DOTS clinic result. The Definitive_Status was then transformed into the dichotomous variables of TB_Status to reflect the final status of the sample as either positive or negative for TB (collapsing across bacterial concentrations for positive samples). Then, TB_Status was compared to hit to compute the dependent variable of Rat_Performance, which consists of 4 categories: correct hit, miss, false alarm, and correct reject (<xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <p>After the data preparation, 4 variables for the detection performance of the rats, including TB_Status, age, weight, and hit, as shown in <xref ref-type="table" rid="table2">Table 2</xref>, were used to build the predictive model. Moreover, this study used 366,441 observations for analysis after removing the null rows from the rats data set to prevent noise, outliers, and inconsistencies in the data. The sklearn model selection library through a train-test split class was used to partition the data (366,441 observations) into 256,508 observations (70%) in the training data and 109,933 observations (30%) in the test data. It is important to mention that, due to the binary nature of many variables and the underlying prevalence of TB infections, the data used in this study lack a normal distribution, as shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>Categorical variables were used to build predictive models, and 256,508 observations (70%) were used for training the models. The TB_Status variable consisted of 10.90% (27,950/256,508) positive samples and 89.10% (228,558/256,508) negative samples. The hit variable consisted of 21.33% (54,719/256,508) true values and 78.67% (201,789/256,508) false values.</p>
        <p><xref ref-type="table" rid="table3">Table 3</xref> shows a statistical summary of the distribution of continuous variable data before and after the random data split. Despite most of the distributions being the same, the mean of age and weight variables showed a difference of 0.01. Moreover, the SD of ID_RAT and weight differed by 0.01.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Description of the dependent and independent variables used to build predictive models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="520"/>
            <col width="80"/>
            <col width="120"/>
            <col width="180"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td>Description</td>
                <td>Data type</td>
                <td>Variable type</td>
                <td>Value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>TB_Status</td>
                <td>Final diagnosis of the sample as either TB<sup>a</sup> positive or TB negative. Combines the diagnostic results of both DOTS<sup>b</sup> and APOPO<sup>c</sup> (lab confirmation, when applicable) wherein APOPO status (results) overrides DOTS.</td>
                <td>Object</td>
                <td>Independent/categorical</td>
                <td>True or false</td>
              </tr>
              <tr valign="top">
                <td>Age</td>
                <td>Age of the rat in years at the time when the rat evaluated the patient sample in question</td>
                <td>Object</td>
                <td>Independent</td>
                <td>Age ranges from 0.79 to 7.95 years</td>
              </tr>
              <tr valign="top">
                <td>Weight</td>
                <td>Average rat body weight (in grams) per year because most of DetectionRatsData describes the daily detection tasks and misses their corresponding weights since the weight of the rats from the RAT_WEIGHT data set was measured every week.</td>
                <td>Object</td>
                <td>Independent</td>
                <td>Average rat body weight ranges from 843.67 to 1054.83 grams</td>
              </tr>
              <tr valign="top">
                <td>Hit<sup>d</sup></td>
                <td>Defined as a continuous sniff (nose insertion into the cage hole) for ≥3 seconds. True means the rat “indicated” that the sample contained TB (held its nose in the hole for at least 3 seconds). False means the rat rejected the sample (did not hold its nose for at least 3 seconds).</td>
                <td>Object</td>
                <td>Dependent/categorical</td>
                <td>True or false</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>TB: tuberculosis.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>DOTS: directly-observed treatment, short-course.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>APOPO: Anti-Persoonsmijnen Ontmijnende Product Ontwikkeling.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>Hit refers to whether or not (true or false) the rat provided an indication.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Descriptive statistics of the continuous variables used to build predictive models before and after random data split.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="370"/>
            <col width="300"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Data split status and variable</td>
                <td>Age (years)</td>
                <td>Weight (g)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>Before random data split (n=366,441, 100%)</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mean</td>
                <td>3.83</td>
                <td>899.40</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SD</td>
                <td>1.72</td>
                <td>84.37</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>IQR</td>
                <td>3.71</td>
                <td>866.80</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Minimum</td>
                <td>0.79</td>
                <td>843.67</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Maximum</td>
                <td>7.95</td>
                <td>1054.83</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>After random data split (n=256,508, 70%)</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Mean</td>
                <td>3.84</td>
                <td>899.41</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>SD</td>
                <td>1.72</td>
                <td>84.36</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>IQR</td>
                <td>3.71</td>
                <td>866.80</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Minimum</td>
                <td>0.79</td>
                <td>843.67</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Maximum</td>
                <td>7.95</td>
                <td>1054.83</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Building</title>
        <p>The predictive model in this study was developed using 5 different ML techniques: decision tree, random forest, naïve Bayes, SVM, and kNN. This study used Python libraries for data preprocessing, matrix processing, mathematical functions, visualization, and classification. These are Pandas, Numpy, Matplotlib, and Scikit-learn [<xref ref-type="bibr" rid="ref25">25</xref>]. The repetitive approach was used to generate a decision tree by dividing the training data. The data were divided recursively until the same class of variables, depending on conditions, using roughly 15,000 samples per leaf, were distributed among each division to create the decision tree. After that, each node in the decision tree used a split point to test the altered variables and choose how to divide the data. The split decision was concerned with the information gain and entropy of a computed variable. The variable that had the greatest information gain and the least entropy was therefore divided and put to the test. The choice regarding the data split and decision tree building was made based on information gain and entropy [<xref ref-type="bibr" rid="ref16">16</xref>]. This study used pruning to maintain control over the parameters being used to remedy expansion.</p>
        <p>During the training procedure, many decision trees were randomly constructed using the random forest technique. Based on the provided manipulated variables, the algorithm’s ultimate decision was based on the selection of the majority of the trees. There was a connection between the outcome and the number of trees in the forest. The outcome was therefore more accurate with an increase in the number of trees. As a result, the technique handled 500 trees in the ensemble, and it calculated the error rate using the training set of information. In the random forest approach, the training data were used to generate random splits for the root node and variable node. Since there was no parameter control during training, the connection between trees remained strong. Additionally, the frequency and values of the adjusted variables from the provided data were counted to generate the classification model using the naïve Bayes method. This method determined the dependent variable’s a priori probabilities as well as the conditional probabilities for each independent variable based on the altered data. The naïve Bayes technique has been specifically utilized to contrast its prediction performance with the outcomes produced by other ML techniques. It does not display the weights of each variable included in the classification.</p>
        <p>SVM is one of the most common supervised ML algorithms owing to its greater predictive power. SVM analyzes data, recognizes patterns, and produces input-output functions from a set of labeled training data. It works by classifying a response variable by drawing a decision boundary line or hyperplane to separate 2 classes. Then, the maximum margin hyperplanes are constructed to optimally separate the output classes from each other in the training data. The goal is to find the optimal separating hyperplane where the separating margin is maximized. The linear kernel was used to allow flexibility and loss functions. The kNN algorithm is a supervised ML algorithm that works by identifying a set of k-nearest observations to the test point and calculating mainly the Euclidean distance between an observation and its kNN in training data. The k in kNN refers to the number of nearest neighbors the classifier will retrieve and use to make its prediction. The chosen k in kNN was 1, as it is suggested to provide the best test prediction.</p>
      </sec>
      <sec>
        <title>Performance Measurements</title>
        <p>This study used accuracy, specificity, sensitivity, and F1 score as metrics to evaluate the performance of the generated predictive models and compare classification performances. These measurements were supported in the <italic>scikit-learn</italic> library through the classification report class.</p>
        <sec>
          <title>Accuracy</title>
          <p>The classification accuracy was calculated based on the confusion matrix, which accurately categorized the actual class labels of the test data and the class labels of the predicted models. It was also obtained by dividing the number of truly classified instances by the number of instances in the test phase. Accuracy considers TP, true negative (TN), false positive (FP), and false negative (FN). The classification accuracy for the data set was measured according to the following formula:</p>
          <disp-formula>Accuracy = (TP + TN) / (TP + FP + TN + FN) <bold>(1)</bold></disp-formula>
        </sec>
        <sec>
          <title>Sensitivity</title>
          <p>Sensitivity is defined as the number of TP cases over the number of TP cases plus the number of FN cases. Sensitivity identifies the correct positive predictions relative to the total actual positive cases. It is sometimes called a recall metric. The formula of sensitivity is as follows:</p>
          <disp-formula>Sensitivity = TP / (TP + FN) <bold>(2)</bold></disp-formula>
        </sec>
        <sec>
          <title>Specificity</title>
          <p>Specificity is the ratio between TN cases and all negative cases. In this study, the precision measure identified the correct positive predictions relative to total positive predictions. For diagnostic tools, this could be termed positive predictive value (PPV) or precision. It essentially provides confidence that any given positive response reflects a truly positive condition [<xref ref-type="bibr" rid="ref25">25</xref>]. The formula of specificity is as follows:</p>
          <disp-formula>Specificity = TN / (TN + FP) <bold>(3)</bold></disp-formula>
        </sec>
        <sec>
          <title>F1 Score</title>
          <p>The F1 score is the harmonic mean of specificity and sensitivity. Basically, it is the weighted average of specificity and sensitivity. The F1 score was calculated from the specificity and sensitivity of the test data set [<xref ref-type="bibr" rid="ref25">25</xref>]. The formula of the F1 score is as follows:</p>
          <disp-formula>F1 score = 2 ([Precision × Sensitivity] / [Precision + Sensitivity]) <bold>(4)</bold></disp-formula>
          <p>It is important to mention that specificity and sensitivity are similar to precision and recall, respectively.</p>
        </sec>
      </sec>
      <sec>
        <title>Restrictions of the Study</title>
        <p>This study ran the predictive models on a computer with a Core i5-5300U CPU at 2.30 GHz (2301 MHz, 2 cores, 4 logical processors) and 8 GB of RAM. The sample size, on the other hand, was small, with only 4 rats and a gender imbalance. Moreover, the hit variable consisted of fewer true values (21.26%) than false values (78.74%).</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The study was approved by the SUA (DPRTC/R/142/vol.01/104) and Medical Research Coordinating Committee of Tanzania (NIMR/HQ/R.8a/Vol.1X/3905). The use of African giant pouched rats as a potential tool for TB diagnosis has received ethics clearance from the Tanzanian Medical Research Coordinating Committee [<xref ref-type="bibr" rid="ref26">26</xref>]. The Office of Laboratory Animal Welfare has approved APOPO’s Animal Welfare Assurance (OLAW; Assurance Identification Number A5720-01).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Comparing Classification Performance Measurements of the Predictive Models</title>
        <p>This study used different ML techniques to build the predictive models following the methodology presented in <xref rid="figure3" ref-type="fig">Figure 3</xref>. Moreover, this study employed several metrics, including accuracy, sensitivity, specificity, and F1 score, to measure the classification performance of the predictive models based on test data. <xref rid="figure4" ref-type="fig">Figure 4</xref> shows the confusion matrices of the SVM and random forest classifiers, while <xref ref-type="table" rid="table4">Table 4</xref> summarizes the performance of all 5 ML techniques used to build the predictive models. The classification accuracy of the kNN technique was low at about 81.25%, while the best-performing algorithm was SVM. As can be seen from <xref ref-type="table" rid="table4">Table 4</xref>, validation showed that the SVM classifier based on the 4 variables shown in <xref ref-type="table" rid="table2">Table 2</xref> achieved an accuracy of 83.39% and had a better ability to recognize the status of TB as either positive or negative in a given sample.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Process flow of machine learning–based prediction models of rat tuberculosis detection performance. The rectangle symbols represent data, while the histogram entails model evaluation metrics. DT: decision tree; kNN: k-nearest neighbor; NB: naïve Bayes; RF: random forest; SVM: support vector machine.</p>
          </caption>
          <graphic xlink:href="ojphi_v16i1e50771_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Confusion matrices of the predictive models. (A) Support vector machine classifier; (B) Random forest classifier.</p>
          </caption>
          <graphic xlink:href="ojphi_v16i1e50771_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparing the classification performance of classifiers of rat tuberculosis detection.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="370"/>
            <col width="100"/>
            <col width="110"/>
            <col width="100"/>
            <col width="170"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td rowspan="2">Classification performance measurement</td>
                <td colspan="5">Predictive model</td>
              </tr>
              <tr valign="top">
                <td>Decision tree</td>
                <td>Random forest</td>
                <td>Naïve Bayes</td>
                <td>Support vector machine</td>
                <td>K-nearest neighbor</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Accuracy, %</td>
                <td>83.32</td>
                <td>83.38</td>
                <td>82.56</td>
                <td>83.39</td>
                <td>81.25</td>
              </tr>
              <tr valign="top">
                <td>Sensitivity, %</td>
                <td>65.00</td>
                <td>65.00</td>
                <td>63.00</td>
                <td>66.00</td>
                <td>64.05</td>
              </tr>
              <tr valign="top">
                <td>Specificity, %</td>
                <td>79.00</td>
                <td>79.00</td>
                <td>77.00</td>
                <td>78.00</td>
                <td>72.05</td>
              </tr>
              <tr valign="top">
                <td>F1 score, %</td>
                <td>67.00</td>
                <td>67.00</td>
                <td>66.00</td>
                <td>69.00</td>
                <td>66.05</td>
              </tr>
              <tr valign="top">
                <td>Correctly classified observations (true positive), n</td>
                <td>91,602</td>
                <td>91,602</td>
                <td>90,370</td>
                <td>91,602</td>
                <td>89,326</td>
              </tr>
              <tr valign="top">
                <td>Incorrectly classified observations (false negative), n</td>
                <td>18,331</td>
                <td>18,331</td>
                <td>19,163</td>
                <td>18,331</td>
                <td>20,607</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Important Variables Influencing the TB Detection Performance of the Rats</title>
        <p>This study used the random forest variable importance function to output the predictor variables based on the mean decrease in Gini (impurity). Random forest showed high performance in the feature ranking. The mean decrease in the Gini value is the average (mean) of a variable’s total decrease in the likelihood of incorrect classification of a new instance of a random variable from the data set. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> shows the predicted variable importance based on the mean decrease in the Gini value using the random forest algorithm.</p>
        <p>As shown in <xref ref-type="table" rid="table5">Table 5</xref>, higher (0.817152) and lower (0.026657) mean decreases in the Gini value indicate greater and lesser variable importance, respectively. In other words, TB_Status and weight were the most and least significant variables, respectively, for predicting rat TB detection accuracy. However, for easy interpretation and visualization of these results, the variable importance function of the random forest algorithm sorted and displayed the variables as reported in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> based on the prediction importance. As such, the variable that contributed most to the prediction had the highest mean decrease in Gini values, followed by the variables with less importance.</p>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Random forest variable importance based on the mean decrease in the Gini value.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="400"/>
            <col width="400"/>
            <thead>
              <tr valign="top">
                <td>Variable</td>
                <td>Variable name</td>
                <td>Mean decrease in the Gini value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>0</td>
                <td>TB_Status</td>
                <td>0.817152</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>Age</td>
                <td>0.156190</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>Weight</td>
                <td>0.026657</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Algorithm for the Prediction of Rat TB Detection Performance</title>
        <p>The study also employed a prediction algorithm for TB detection as illustrated in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <p><xref ref-type="boxed-text" rid="box1">Textbox 1</xref> shows the algorithm that predicts if the rat will hit the sample or not. First, data were imported and normalized to acquire the required data format. Then, the statistical summary of the independent variables used to build predictive models was described. Considering <xref rid="figure3" ref-type="fig">Figure 3</xref>, the train_test_split library was used to divide the data set into training data (70%) for developing the models and test data (30%) for validating the models. The predictive models were trained based on the decision tree, random forest, naïve Bayes, SVM, and kNN classifiers, using the train data. Meanwhile, the validation of the models was performed using the test data. Then, accuracy, sensitivity, specificity, and F1 score were used to measure the classification performance of each classifier, as reported in <xref ref-type="table" rid="table4">Table 4</xref>. Furthermore, the input variables TB_Status, age, and weight were entered for prediction. Following the prediction, models were validated using the test data. Hence, data visualization was performed using the Matplotlib library for proper interpretation of the results. On the other hand, if the constraints were not met, the algorithm could be terminated.</p>
        <p>In addition to the above algorithm for the prediction of rat TB detection performance, <xref rid="figure3" ref-type="fig">Figure 3</xref> indicates the process flow of ML models and their predictions using <italic>Python</italic> libraries. The TB input data set was imported as a .csv file. After preprocessing the data, the <italic>sklearn model selection</italic> library was used to partition the data into training data (70%) and test data (30%) by using a simple random split method. The training data were used to build a predictive model using decision tree, random forest, naïve Bayes, SVM, and kNN classifiers. After building the predictive model, the inputs, including TB_Status, age, and weight, were computed to predict if the rat would hit the sample or not. Thereafter, the predictive models were evaluated for their prediction performance using accuracy, sensitivity, specificity, and F1 score metrics.</p>
        <boxed-text id="box1" position="float">
          <title>Algorithm for the prediction of rat tuberculosis detection performance.</title>
          <p>I. Import and normalize the data set (.csv)</p>
          <p>II. Calculate IQR, mean, SD, minimum, and maximum</p>
          <p>III. Perform splitting of the data set</p>
          <p>1. if splitting is successful and there are no constraints then</p>
          <p>- train the model</p>
          <p>2. Perform machine learning (ML) modeling based on decision tree, random forest, naïve Bayes, support vector machine, and k-nearest neighbor</p>
          <p>3. Perform validation of the ML modeling</p>
          <p>4. Perform ML model prediction</p>
          <p>5. Validate the prediction model by calculating <italic>accuracy</italic>, <italic>sensitivity</italic>, <italic>specificity</italic>, and <italic>F1 score</italic></p>
          <p>6. if <italic>accuracy and other parameters are good</italic> then</p>
          <p>- input: <italic>TB_Status</italic>, <italic>age</italic>, <italic>weight</italic></p>
          <p>7. Perform ML model prediction of whether the rat would hit the sample or not</p>
          <p>8. Update the predicted value of new data for reporting</p>
          <p>9. Make data visualization in Python</p>
          <p>10. else</p>
          <p>11. Perform termination check</p>
          <p>12. else</p>
          <p>13. End</p>
        </boxed-text>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The aim of this study was to build on the prior work of Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>] to develop models that predict if a trained TB-detection rat would hit (indicate the presence of TB within) a patient sample or not using ML techniques by incorporating variables related to the diagnostic results of the TB samples. This study used decision tree, random forest, naïve Bayes, SVM, and kNN ML techniques to build predictive models. The ML techniques successfully categorized the data by assigning a label to each computed data point. The results revealed that for the 5 different algorithms used, the classification accuracy was the greatest for SVM, suggesting its superiority to the decision tree, random forest, naïve Bayes, and kNN classifiers. The SVM classifier outperformed the other classifiers by yielding a classification accuracy of about 83.39% for predicting if the rat would hit the sample or not. This level of accuracy surpasses the 78.82% accuracy found with decision tree and naïve Bayes by Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>], suggesting that the inclusion of sample information serves as a valuable variable that influences the performance of TB-detection rats and improves the accuracy of the prediction models. Moreover, Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>] employed a small amount of data compared to the data used in this study. In fact, TB_Status was found to be the most significant variable in predicting rat TB detection performance. However, there was an insignificant accuracy difference between the constructed models and those created by Jonathan et al [<xref ref-type="bibr" rid="ref10">10</xref>], which could be due to the characteristics of the data [<xref ref-type="bibr" rid="ref16">16</xref>]. Therefore, the additional variables are likely to influence rat behavior, and the true status of patient samples can only be determined by available diagnostics.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>This study has shown the usefulness of ML techniques to identify factors that influence TB detection performance of rats. The techniques used were decision tree, random forest, naïve Bayes, SVM, and kNN to develop models that predict if the rat would hit the sample or not by incorporating valuable variables related to TB detection performance of rats. The performance of the predictive models was measured by accuracy, sensitivity, specificity, and F1 score metrics. The results showed that the SVM predictive model outperformed the other models in the classification and prediction of the performance of rats in TB detection by yielding the highest accuracy of 83.39%. Furthermore, the obtained results suggest that the inclusion of variables related to the diagnostic results of TB samples improves the performance of the predictive models. Therefore, the results might benefit TB-detection rat trainers and TB decision-makers in improving the diagnostic accuracy of rats by predicting if a trained TB-detection rat would hit a patient sample or not. They can adopt several measures, including ensuring that all hit samples are confirmed within APOPO’s laboratory (ID_BL_APOPO). Furthermore, they should take into consideration that the age of the rat at hit and clinic diagnostic results are predictors of detection performance.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Distribution of continuous independent variables.</p>
        <media xlink:href="ojphi_v16i1e50771_app1.png" xlink:title="PNG File , 20 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Random forest variable importance plot.</p>
        <media xlink:href="ojphi_v16i1e50771_app2.png" xlink:title="PNG File , 16 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">APOPO</term>
          <def>
            <p>Anti-Persoonsmijnen Ontmijnende Product Ontwikkeling</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DOTS</term>
          <def>
            <p>directly-observed treatment, short-course</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">FN</term>
          <def>
            <p>false negative</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">FP</term>
          <def>
            <p>false positive</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">kNN</term>
          <def>
            <p>k-nearest neighbor</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">SDT</term>
          <def>
            <p>Signal Detection Theory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SUA</term>
          <def>
            <p>Sokoine University of Agriculture</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">TB</term>
          <def>
            <p>tuberculosis</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">TN</term>
          <def>
            <p>true negative</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">TP</term>
          <def>
            <p>true positive</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">WHO</term>
          <def>
            <p>World Health Organization</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>We thank all reviewers who provided their insightful comments and suggestions for the improvement of the study and thank the APOPO (Anti-Persoonsmijnen Ontmijnende Product Ontwikkeling) TB Training and Research Center in Morogoro that provided data for this study. This work was supported (grant number CC003) in part by the Government of Tanzania through the Research and Innovation Grants of the Sokoine University of Agriculture (SUA). All authors declared that they had insufficient or no funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. JMIR Publications provided article processing fee (APF) support for the publication of this article.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ellis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mulder</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Valverde</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Poling</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Edwards</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Reproducibility of African giant pouched rats detecting Mycobacterium tuberculosis</article-title>
          <source>BMC Infect Dis</source>
          <year>2017</year>
          <month>04</month>
          <day>24</day>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>298</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcinfectdis.biomedcentral.com/articles/10.1186/s12879-017-2347-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12879-017-2347-3</pub-id>
          <pub-id pub-id-type="medline">28438117</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12879-017-2347-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC5402322</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Poling</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Weetjens</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Beyene</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Durgin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mahoney</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Tuberculosis detection by giant African pouched rats</article-title>
          <source>BEHAV ANALYST</source>
          <year>2017</year>
          <month>6</month>
          <day>1</day>
          <volume>34</volume>
          <issue>1</issue>
          <fpage>47</fpage>
          <lpage>54</lpage>
          <pub-id pub-id-type="doi">10.1007/bf03392234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <article-title>APOPO Annual Report, 2022</article-title>
          <source>APOPO</source>
          <access-date>2024-01-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://apopo.org/wp-content/uploads/2023/06/APOPO_annual-report-2022-final.pdf">https://apopo.org/wp-content/uploads/2023/06/APOPO_annual-report-2022-final.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beyene</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mahoney</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Weetjens</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Makingi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mgode</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Durgin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kuipers</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jubitana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Egwaga</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kamara</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Lwilla</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Mfinanga</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Kahwa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Machang'u</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kazwala</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Reither</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kaufmann</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Poling</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>APOPO's tuberculosis research agenda: achievements, challenges and prospects</article-title>
          <source>Tanzan J Health Res</source>
          <year>2012</year>
          <month>04</month>
          <day>24</day>
          <volume>14</volume>
          <issue>2</issue>
          <fpage>121</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="doi">10.4314/thrb.v14i2.5</pub-id>
          <pub-id pub-id-type="medline">26591733</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>Global tuberculosis report 2021</article-title>
          <source>World Health Organization</source>
          <access-date>2022-04-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/publications/i/item/9789240037021">https://www.who.int/publications/i/item/9789240037021</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fiebig</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Beyene</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Burny</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fast</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mgode</surname>
              <given-names>GF</given-names>
            </name>
          </person-group>
          <article-title>From pests to tests: training rats to diagnose tuberculosis</article-title>
          <source>Eur Respir J</source>
          <year>2020</year>
          <month>03</month>
          <day>20</day>
          <volume>55</volume>
          <issue>3</issue>
          <fpage>1902243</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://erj.ersjournals.com/cgi/pmidlookup?view=long&amp;pmid=32198268"/>
          </comment>
          <pub-id pub-id-type="doi">10.1183/13993003.02243-2019</pub-id>
          <pub-id pub-id-type="medline">32198268</pub-id>
          <pub-id pub-id-type="pii">55/3/1902243</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <article-title>Manual for Management of Tuberculosis and Leprosy in Tanzania</article-title>
          <source>National Tuberculosis and Leprosy Programme (NTLP)</source>
          <access-date>2022-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ntlp.go.tz/site/assets/files/1081/ntlp_manual_2020_2021_1.pdf">https://ntlp.go.tz/site/assets/files/1081/ntlp_manual_2020_2021_1.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Matthen</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>The Oxford Handbook of Philosophy of Perception</source>
          <year>2015</year>
          <publisher-loc>Oxford, England</publisher-loc>
          <publisher-name>Oxford University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Green</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Swets</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <source>Signal detection theory and psychophysics</source>
          <year>1966</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>John Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jonathan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sanga</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mwita</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mgode</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Visual Analytics of Tuberculosis Detection Rat Performance</article-title>
          <source>Online J Public Health Inform</source>
          <year>2021</year>
          <month>09</month>
          <day>08</day>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>e12</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.5210/ojphi.v13i2.11465"/>
          </comment>
          <pub-id pub-id-type="doi">10.5210/ojphi.v13i2.11465</pub-id>
          <pub-id pub-id-type="medline">34659646</pub-id>
          <pub-id pub-id-type="pii">ojphi-13-e12</pub-id>
          <pub-id pub-id-type="pmcid">PMC8500793</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mgode</surname>
              <given-names>GF</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Mwimanzi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mulder</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Pediatric tuberculosis detection using trained African giant pouched rats</article-title>
          <source>Pediatr Res</source>
          <year>2018</year>
          <month>07</month>
          <day>4</day>
          <volume>84</volume>
          <issue>1</issue>
          <fpage>99</fpage>
          <lpage>103</lpage>
          <pub-id pub-id-type="doi">10.1038/pr.2018.40</pub-id>
          <pub-id pub-id-type="medline">29617007</pub-id>
          <pub-id pub-id-type="pii">10.1038/pr.2018.40</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vasilakos</surname>
              <given-names>AV</given-names>
            </name>
          </person-group>
          <article-title>Machine learning on big data: Opportunities and challenges</article-title>
          <source>Neurocomputing</source>
          <year>2017</year>
          <month>05</month>
          <volume>237</volume>
          <fpage>350</fpage>
          <lpage>361</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2017.01.026</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mirbabaie</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stieglitz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Frick</surname>
              <given-names>NRJ</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in disease diagnostics: A critical review and classification on the current state of research guiding future direction</article-title>
          <source>Health Technol</source>
          <year>2021</year>
          <month>05</month>
          <day>10</day>
          <volume>11</volume>
          <issue>4</issue>
          <fpage>693</fpage>
          <lpage>731</lpage>
          <pub-id pub-id-type="doi">10.1007/s12553-021-00555-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zeeshan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence with multi-functional machine learning platform development for better healthcare and precision medicine</article-title>
          <source>Database (Oxford)</source>
          <year>2020</year>
          <month>01</month>
          <day>01</day>
          <volume>2020</volume>
          <fpage>baaa010</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32185396"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/database/baaa010</pub-id>
          <pub-id pub-id-type="medline">32185396</pub-id>
          <pub-id pub-id-type="pii">5809229</pub-id>
          <pub-id pub-id-type="pmcid">PMC7078068</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mehta</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Pandit</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Concurrence of big data analytics and healthcare: A systematic review</article-title>
          <source>Int J Med Inform</source>
          <year>2018</year>
          <month>06</month>
          <volume>114</volume>
          <fpage>57</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2018.03.013</pub-id>
          <pub-id pub-id-type="medline">29673604</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(18)30246-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharda</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Delen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Turban</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Aronson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Business Intelligence and Analytics: Systems for Decision Support</source>
          <year>2014</year>
          <publisher-loc>London, England</publisher-loc>
          <publisher-name>Pearson Education</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in healthcare: past, present and future</article-title>
          <source>Stroke Vasc Neurol</source>
          <year>2017</year>
          <month>12</month>
          <volume>2</volume>
          <issue>4</issue>
          <fpage>230</fpage>
          <lpage>243</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://svn.bmj.com/lookup/pmidlookup?view=long&amp;pmid=29507784"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/svn-2017-000101</pub-id>
          <pub-id pub-id-type="medline">29507784</pub-id>
          <pub-id pub-id-type="pii">svn-2017-000101</pub-id>
          <pub-id pub-id-type="pmcid">PMC5829945</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zomorodi-Moghadam</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Performance analysis of classification algorithms on early detection of liver disease</article-title>
          <source>Expert Systems with Applications</source>
          <year>2017</year>
          <month>01</month>
          <volume>67</volume>
          <fpage>239</fpage>
          <lpage>251</lpage>
          <pub-id pub-id-type="doi">10.1016/j.eswa.2016.08.065</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ba</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Automatic detection of mycobacterium tuberculosis using artificial intelligence</article-title>
          <source>J Thorac Dis</source>
          <year>2018</year>
          <month>03</month>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>1936</fpage>
          <lpage>1940</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29707349"/>
          </comment>
          <pub-id pub-id-type="doi">10.21037/jtd.2018.01.91</pub-id>
          <pub-id pub-id-type="medline">29707349</pub-id>
          <pub-id pub-id-type="pii">jtd-10-03-1936</pub-id>
          <pub-id pub-id-type="pmcid">PMC5906344</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hrizi</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Gasmi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ben Ltaifa</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Alshammari</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Karamti</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Krichen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ben Ammar</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Tuberculosis Disease Diagnosis Based on an Optimized Machine Learning Model</article-title>
          <source>J Healthc Eng</source>
          <year>2022</year>
          <month>03</month>
          <day>21</day>
          <volume>2022</volume>
          <fpage>8950243</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2022/8950243"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2022/8950243</pub-id>
          <pub-id pub-id-type="medline">35494520</pub-id>
          <pub-id pub-id-type="pmcid">PMC9041161</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Asha</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Murthy</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Effective classification algorithms to predict the accuracy of tuberculosis—A machine learning approach</article-title>
          <source>International Journal of Computer Science and Information Security</source>
          <year>2011</year>
          <volume>9</volume>
          <issue>7</issue>
          <fpage>89</fpage>
          <lpage>94</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.researchgate.net/publication/303917794_Effective_Classification_Algorithms_to_Predict_the_Accuracy_of_Tuberculosis-_A_Machine_Learning_Approach"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Trihartati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Adi</surname>
              <given-names>CK</given-names>
            </name>
          </person-group>
          <article-title>An Identification of Tuberculosis (TB) Disease in Humans using Naïve Bayesian Method</article-title>
          <source>Scientific Journal of Informatics</source>
          <year>2016</year>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>99</fpage>
          <lpage>108</lpage>
          <pub-id pub-id-type="doi">10.15294/sji.v3i2.7918</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soni</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rai</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ahirwar</surname>
              <given-names>SK</given-names>
            </name>
          </person-group>
          <article-title>Mycobacterium Tuberculosis Detection using Support Vector Machine Classification Approach</article-title>
          <year>2021</year>
          <conf-name>10th IEEE International Conference on Communication Systems and Network Technologies (CSNT)</conf-name>
          <conf-date>June 18-19, 2021</conf-date>
          <conf-loc>Bhopal, India</conf-loc>
          <fpage>408</fpage>
          <lpage>413</lpage>
          <pub-id pub-id-type="doi">10.1109/CSNT51715.2021.9509635</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Hachimi</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Belaqziz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Khabba</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chehbouni</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Data Science Toolkit: An all-in-one python library to help researchers and practitioners in implementing data science-related algorithms with less effort</article-title>
          <source>Software Impacts</source>
          <year>2022</year>
          <month>05</month>
          <volume>12</volume>
          <fpage>100240</fpage>
          <pub-id pub-id-type="doi">10.1016/j.simpa.2022.100240</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Asif</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nishat</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Faisal</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Dip</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Udoy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shikder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ahsan</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Performance Evaluation and Comparative Analysis of Different Machine Learning Algorithms in Predicting Cardiovascular Disease</article-title>
          <source>Engineering Letters</source>
          <year>2021</year>
          <volume>29</volume>
          <issue>2</issue>
          <fpage>EL_29_2_42</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.engineeringletters.com/issues_v29/issue_2/EL_29_2_42.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reither</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jugheli</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Glass</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sasamalo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mhimbira</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weetjens</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Edwards</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mulder</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Beyene</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mahoney</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of Giant African Pouched Rats for Detection of Pulmonary Tuberculosis in Patients from a High-Endemic Setting</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>10</issue>
          <fpage>e0135877</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0135877"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0135877</pub-id>
          <pub-id pub-id-type="medline">26445086</pub-id>
          <pub-id pub-id-type="pii">PONE-D-15-14974</pub-id>
          <pub-id pub-id-type="pmcid">PMC4596709</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
