BibTeX

choudhury-heuristics-jcdl20

Choudhury, Muntabir Hasan, Wu, Jian, Ingram, William A., and Fox, Edward A., “A Heuristic Baseline Method for Metadata Extraction from Scanned Electronic Theses and Dissertations,” In Proceedings of the ACM/IEEE Joint Conference on Digital Libraries in 2020. New York, NY, USA 2020. Association for Computing Machinery, p. 515–516.

Abstract:

Extracting metadata from scholarly papers is an important text mining problem. Widely used open-source tools such as GROBID are designed for born-digital scholarly papers but often fail for scanned documents, such as Electronic Theses and Dissertations (ETDs). Here we present a preliminary baseline work with a heuristic model to extract metadata from the cover pages of scanned ETDs. The process started with converting scanned pages into images and then text files by applying OCR tools. Then a series of carefully designed regular expressions for each field is applied, capturing patterns for seven metadata fields: titles, authors, years, degrees, academic programs, institutions, and advisors. The method is evaluated on a ground truth dataset comprised of rectified metadata provided by the Virginia Tech and MIT libraries. Our heuristic method achieves an accuracy of up to 97% on the fields of the ETD text files. Our method poses a strong baseline for machine learning based methods. To our best knowledge, this is the first work attempting to extract metadata from non-born-digital ETDs.

BibTeX entry:

@INPROCEEDINGS {choudhury-heuristics-jcdl20,
    author = {Choudhury, Muntabir Hasan and Wu, Jian and Ingram, William A. and Fox, Edward A.},
    title = {A Heuristic Baseline Method for Metadata Extraction from Scanned Electronic Theses and Dissertations},
    booktitle = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries in 2020},
    series = {JCDL '20},
    year = {2020},
    pages = {515--516},
    numpages = {2},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    location = {Virtual Event, China},
    isbn = {9781450375856},
    doi = {10.1145/3383583.3398590},
    url = {https://doi.org/10.1145/3383583.3398590},
    keywords = {digital libraries, heuristic method, metadata extraction, optical character recognition (OCR), text mining},
    abstract = {Extracting metadata from scholarly papers is an important text mining problem. Widely used open-source tools such as GROBID are designed for born-digital scholarly papers but often fail for scanned documents, such as Electronic Theses and Dissertations (ETDs). Here we present a preliminary baseline work with a heuristic model to extract metadata from the cover pages of scanned ETDs. The process started with converting scanned pages into images and then text files by applying OCR tools. Then a series of carefully designed regular expressions for each field is applied, capturing patterns for seven metadata fields: titles, authors, years, degrees, academic programs, institutions, and advisors. The method is evaluated on a ground truth dataset comprised of rectified metadata provided by the Virginia Tech and MIT libraries. Our heuristic method achieves an accuracy of up to 97\% on the fields of the ETD text files. Our method poses a strong baseline for machine learning based methods. To our best knowledge, this is the first work attempting to extract metadata from non-born-digital ETDs.}
}

choudhury-autometa-jcdl21

Hasan Choudhury, Muntabir, Jayanetti, Himarsha R., Wu, Jian, Ingram, William A., and Fox, Edward A., “Automatic Metadata Extraction Incorporating Visual Features from Scanned Electronic Theses and Dissertations,” In 2021 ACM/IEEE Joint Conference on Digital Libraries (JCDL). 2021, pp. 230-233.

BibTeX entry:

@INPROCEEDINGS {choudhury-autometa-jcdl21,
    author = {Choudhury, Muntabir Hasan and Jayanetti, Himarsha R. and Wu, Jian and Ingram, William A. and Fox, Edward A.},
    title = {Automatic Metadata Extraction Incorporating Visual Features from Scanned Electronic Theses and Dissertations},
    booktitle = {2021 ACM/IEEE Joint Conference on Digital Libraries (JCDL)},
    year = {2021},
    pages = {230--233},
    doi = {10.1109/JCDL52503.2021.00066},
    keywords = {Representation learning;Visualization;Metadata;Tagging;Search engines;Feature extraction;Libraries;Digital Libraries;Optical Character Recognition;Text Mining;Metadata Extraction;CRF;BiLSTM}
}

reshad-aaai@sdu22author = {Hoque, M. R. U., Wei, X., Choudhury, M. H., Ajayi, K., Gryder, M., Wu, J., & Oyen, D.}

, “Segmenting Technical Drawing Figures in US Patents,” CEUR Workshop Proceedings- SDU 2022 Proceedings of the Workshop on Scientific Document Understanding 2022.

BibTeX entry:

@MISC {reshad-aaai@sdu22,
    author = {Hoque, M. R. U. and Wei, X. and Choudhury, M. H. and Ajayi, K. and Gryder, M. and Wu, J. and Oyen, D.},
    title = {Segmenting Technical Drawing Figures in {US} Patents},
    year = {2022},
    howpublished = {CEUR Workshop Proceedings- SDU 2022 Proceedings of the Workshop on Scientific Document Understanding},
    url = {http://ceur-ws.org/Vol-3164/paper28.pdf}
}

salsabil-SciK22

Salsabil, Lamia, Wu, Jian, Choudhury, Muntabir Hasan, Ingram, William A., Fox, Edward A., Rajtmajer, Sarah M., and Giles, C. Lee, “A Study of Computational Reproducibility using URLs Linking to Open Access Datasets and Software,” In Companion Proceedings of the Web Conference 2022. New York, NY, USA 2022. Association for Computing Machinery, p. 784–788.

Abstract:

Datasets and software packages are considered important resources that can be used for replicating computational experiments. With the advocacy of Open Science and the growing interest of investigating reproducibility of scientific claims, including URLs linking to publicly available datasets and software packages has become an institutionalized part of research publications. In this preliminary study, we investigated the disciplinary dependency and chronological trends of including open access datasets and software (OADS) in electronic theses and dissertations (ETDs), based on a hybrid classifier called OADSClassifier, consisting of a heuristic and a supervised learning model. The classifier achieves the best F1 of 0.92. We found that the inclusion of OADS-URLs exhibited a strong disciplinary dependence and the fraction of ETDs containing OADS-URLs has been gradually increasing over the past 20 years. We developed and share a ground truth corpus consisting of 500 manually labeled sentences containing URLs from scientific papers. The dataset and source code are available at https://github.com/lamps-lab/oadsclassifier.

BibTeX entry:

@INPROCEEDINGS {salsabil-SciK22,
    author = {Salsabil, Lamia and Wu, Jian and Choudhury, Muntabir Hasan and Ingram, William A. and Fox, Edward A. and Rajtmajer, Sarah M. and Giles, C. Lee},
    title = {A Study of Computational Reproducibility using {URLs} Linking to Open Access Datasets and Software},
    booktitle = {Companion Proceedings of the Web Conference 2022},
    series = {WWW '22},
    year = {2022},
    pages = {784--788},
    numpages = {5},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    location = {Virtual Event, Lyon, France},
    isbn = {9781450391306},
    doi = {10.1145/3487553.3524658},
    url = {https://doi.org/10.1145/3487553.3524658},
    keywords = {ETD, language model, open access, reproducibility},
    abstract = {Datasets and software packages are considered important resources that can be used for replicating computational experiments. With the advocacy of Open Science and the growing interest of investigating reproducibility of scientific claims, including URLs linking to publicly available datasets and software packages has become an institutionalized part of research publications. In this preliminary study, we investigated the disciplinary dependency and chronological trends of including open access datasets and software (OADS) in electronic theses and dissertations (ETDs), based on a hybrid classifier called OADSClassifier, consisting of a heuristic and a supervised learning model. The classifier achieves the best F1 of 0.92. We found that the inclusion of OADS-URLs exhibited a strong disciplinary dependence and the fraction of ETDs containing OADS-URLs has been gradually increasing over the past 20 years. We developed and share a ground truth corpus consisting of 500 manually labeled sentences containing URLs from scientific papers. The dataset and source code are available at https://github.com/lamps-lab/oadsclassifier.}
}

ajayi-icdar23

Ajayi, “A Study on Reproducibility and Replicability of Table Structure Recognition Methods,” In Document Analysis and Recognition - ICDAR 2023. Cham 2023. (Fink, Eds.) Springer Nature Switzerland, pp. 3-19.

Abstract:

Concerns about reproducibility in artificial intelligence (AI) have emerged

BibTeX entry:

@INPROCEEDINGS {ajayi-icdar23,
    author = {Ajayi, Kehinde and Choudhury, Muntabir Hasan and Rajtmajer, Sarah M. and Wu, Jian},
    title = {A Study on Reproducibility and Replicability of Table Structure Recognition Methods},
    booktitle = {Document Analysis and Recognition - ICDAR 2023},
    year = {2023},
    publisher = {Springer Nature Switzerland},
    address = {Cham},
    pages = {3--19},
    abstract = {Concerns about reproducibility in artificial intelligence (AI) have emerged, as researchers have reported unsuccessful attempts to directly reproduce published findings in the field. Replicability, the ability to affirm a finding using the same procedures on new data, has not been well studied. In this paper, we examine both reproducibility and replicability of a corpus of 16 papers on table structure recognition (TSR), an AI task aimed at identifying cell locations of tables in digital documents. We attempt to reproduce published results using codes and datasets provided by the original authors. We then examine replicability using a dataset similar to the original as well as a new dataset, GenTSR, consisting of 386 annotated tables extracted from scientific papers. Out of 16 papers studied, we reproduce results consistent with the original in only four. Two of the four papers are identified as replicable using the similar dataset under certain IoU values. No paper is identified as replicable using the new dataset. We offer observations on the causes of irreproducibility and irreplicability. All code and data are available on Codeocean at https://codeocean.com/capsule/6680116/tree.},
    isbn = {978-3-031-41679-8}
}

choudhury-jcdl23

Choudhury, Muntabir Hasan, Salsabil, Lamia, Jayanetti, Himarsha R., Wu, Jian, Ingram, William A., and Fox, Edward A., “MetaEnhance: Metadata Quality Improvement for Electronic Theses and Dissertations of University Libraries,” In 2023 ACM/IEEE Joint Conference on Digital Libraries (JCDL). 2023, pp. 61-65.

BibTeX entry:

@INPROCEEDINGS {choudhury-jcdl23,
    author = {Choudhury, Muntabir Hasan and Salsabil, Lamia and Jayanetti, Himarsha R. and Wu, Jian and Ingram, William A. and Fox, Edward A.},
    title = {{MetaEnhance}: Metadata Quality Improvement for Electronic Theses and Dissertations of University Libraries},
    booktitle = {2023 ACM/IEEE Joint Conference on Digital Libraries (JCDL)},
    year = {2023},
    pages = {61--65},
    doi = {10.1109/JCDL57899.2023.00019},
    keywords = {Codes;Metadata;Benchmark testing;Libraries;Distance measurement;Artificial intelligence;Digital Libraries;Scholarly Big Data;ETD;Metadata Quality;Artificial Intelligence}
}

choudhury-iaai24

Choudhury, Muntabir Hasan, Salsabil, Lamia, Ingram, William A., Fox, Edward A., and Wu, Jian, “ETDPC: A Multimodality Framework for Classifying Pages in Electronic Theses and Dissertations,” Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 38, No. 21, Mar. 2024, pp. 22878-22884.

BibTeX entry:

@ARTICLE {choudhury-iaai24,
    author = {Choudhury, Muntabir Hasan and Salsabil, Lamia and Ingram, William A. and Fox, Edward A. and Wu, Jian},
    title = {{ETDPC}: A Multimodality Framework for Classifying Pages in Electronic Theses and Dissertations},
    journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
    volume = {38},
    number = {21},
    pages = {22878--22884},
    year = {2024},
    month = mar,
    doi = {10.1609/aaai.v38i21.30324},
    url = {https://ojs.aaai.org/index.php/AAAI/article/view/30324},
    abstract = {Electronic theses and dissertations (ETDs) have been proposed, advocated, and generated for more than 25 years. Although ETDs are hosted by commercial or institutional digital library repositories, they are still an understudied type of scholarly big data, partially because they are usually longer than conference and journal papers. Segmenting ETDs will allow researchers to study sectional content. Readers can navigate to particular pages of interest, to discover and explore the content buried in these long documents. Most existing frameworks on document page classification are designed for classifying general documents, and perform poorly on ETDs. In this paper, we propose ETDPC. Its backbone is a two-stream multimodal model with a cross-attention network to classify ETD pages into 13 categories. To overcome the challenge of imbalanced labeled samples, we augmented data for minority categories and employed a hierarchical classifier. ETDPC outperforms the state-of-the-art models in all categories, achieving an F1 of 0.84 -- 0.96 for 9 out of 13 categories. We also demonstrated its data efficiency. The code and data can be found on GitHub (https://github.com/lamps-lab/ETDMiner/tree/master/etd\_segmentation).}
}

ingram-ijdl24author = {Ingram, William A., Wu, Jian, Kahu, Sampanna Yashwant, Manzoor, Javaid Akbar, Banerjee, Bipasha, Ahuja, Aman, Choudhury, Muntabir Hasan, Salsabil, Lamia, Shields, Winston, Fox, Edward A.}

, “Building datasets to support information extraction and structure parsing from electronic theses and dissertations,” International Journal on Digital Libraries, june 2024.

BibTeX entry:

@ARTICLE {ingram-ijdl24,
    author = {Ingram, William A. and Wu, Jian and Kahu, Sampanna Yashwant and Manzoor, Javaid Akbar and Banerjee, Bipasha and Ahuja, Aman and Choudhury, Muntabir Hasan and Salsabil, Lamia and Shields, Winston and Fox, Edward A.},
    title = {Building datasets to support information extraction and structure parsing from electronic theses and dissertations},
    journal = {International Journal on Digital Libraries},
    year = {2024},
    month = jun,
    doi = {10.1007/s00799-024-00395-4},
    url = {https://doi.org/10.1007/s00799-024-00395-4}
}

choudhury-dissertation-2024author= {Choudhury, Muntabir H.}

, “ETDSuite: A Toolkit to Mine Electronic Theses and Dissertations to Enrich Scholarly Big Data Using Natural Language Processing and Computer Vision,” PhD thesis, Old Dominion University 2024.

BibTeX entry:

@PHDTHESIS {choudhury-dissertation-2024,
    author = {Choudhury, Muntabir H.},
    title = {{ETDSuite}: A Toolkit to Mine Electronic Theses and Dissertations to Enrich Scholarly Big Data Using Natural Language Processing and Computer Vision},
    school = {Old Dominion University},
    year = {2024},
    doi = {10.25777/h6qt-1p64},
    url = {https://digitalcommons.odu.edu/computerscience_etds/184}
}