<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
 <record>
  <leader>04387ntm a22005897i 4500</leader>
  <controlfield tag="001">000716613</controlfield>
  <controlfield tag="003">CZ-PrVSE</controlfield>
  <controlfield tag="005">20240824093620.0</controlfield>
  <controlfield tag="006">m        d</controlfield>
  <controlfield tag="007">cr n||||||||||</controlfield>
  <controlfield tag="008">240824s2024    xr     fsbm   000 0 eng d</controlfield>
  <datafield tag="STA" ind1=" " ind2=" ">
   <subfield code="a">NEZPRACOVANÝ IMPORT</subfield>
  </datafield>
  <datafield tag="040" ind1=" " ind2=" ">
   <subfield code="a">ABA006</subfield>
   <subfield code="b">cze</subfield>
   <subfield code="c">ABA006</subfield>
   <subfield code="d">ABA006</subfield>
   <subfield code="e">rda</subfield>
  </datafield>
  <datafield tag="100" ind1="1" ind2=" ">
   <subfield code="a">Balek, Vojtěch</subfield>
   <subfield code="%">ISIS:155574</subfield>
   <subfield code="4">dis</subfield>
  </datafield>
  <datafield tag="242" ind1="1" ind2="0">
   <subfield code="a">Velké jazykové modely jako nástroj pro extrakci rysů z textu</subfield>
   <subfield code="y">eng</subfield>
  </datafield>
  <datafield tag="245" ind1="1" ind2="0">
   <subfield code="a">Large Language Models as a tool for generating high-level features for text documents /</subfield>
   <subfield code="c">Vojtěch Balek</subfield>
  </datafield>
  <datafield tag="264" ind1=" " ind2="0">
   <subfield code="c">2024</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
   <subfield code="a">?? stran :</subfield>
   <subfield code="3">digital, PDF soubor</subfield>
  </datafield>
  <datafield tag="500" ind1=" " ind2=" ">
   <subfield code="a">Vedoucí práce: Tomáš Kliegr</subfield>
  </datafield>
  <datafield tag="502" ind1=" " ind2=" ">
   <subfield code="a">Bakalářská práce (Bc.)—Vysoká škola ekonomická v Praze. Fakulta informatiky a statistiky, 2024</subfield>
  </datafield>
  <datafield tag="504" ind1=" " ind2=" ">
   <subfield code="a">Obsahuje bibliografii</subfield>
  </datafield>
  <datafield tag="516" ind1=" " ind2=" ">
   <subfield code="a">Textový (vysokoškolská kvalifikační práce)</subfield>
  </datafield>
  <datafield tag="518" ind1=" " ind2=" ">
   <subfield code="a">Rok obhajoby 2024</subfield>
  </datafield>
  <datafield tag="520" ind1="3" ind2=" ">
   <subfield code="a">This bachelor thesis investigates the usability of large language models (LLMs) for feature generation from text, evaluating whether LLMs can produce interpretable and usable features for machine-learning tasks. The study uses two labeled datasets: the CORD-19 corpus, consisting of coronavirus research articles with binary labels for high and low citation count, and a dataset of scientific articles from Czech research institutions, with article scores assigned according to the M17+ methodology (ranging from 1 to 5). Seven categorical features were generated for each dataset using the LLama2 language model. These features were used to train models for binary and ordinal classification tasks. Performance was compared to baseline naive models and models trained on term frequency-inverse document frequency (TF-IDF) and sentence embeddings. In the CORD-19 dataset, models using LLM-generated features achieved an accuracy of 59.8%, outperforming the baseline dummy classifier (50.2%) but falling short of TF-IDF (62.5%) and sentence embeddings (62.5%). Combining LLM-generated features with article abstract and title texts using the AutoGluon platform achieved the highest accuracy (66.5%), followed by combining TF-IDF terms and LLM-generated features (65.3%). For the M17+ dataset, the model using LLM-generated features attained an accuracy of 37%, surpassing the naive classifier (18%) and TF-IDF (34.3%). Sentence embeddings achieved the highest accuracy (40.8%), while the AutoGluon model trained on abstract and title text achieved 39.5%. LLM-generated features enhanced the predictive performance of models and demonstrated higher interpretability compared to traditional bibliometric features. However, a notable limitation is the computational cost; generating features for small datasets (2000-3000 samples) requires tens of hours on high-end hardware.</subfield>
  </datafield>
  <datafield tag="538" ind1=" " ind2=" ">
   <subfield code="a">Způsob přístupu: Internet</subfield>
  </datafield>
  <datafield tag="653" ind1="0" ind2=" ">
   <subfield code="a">data analytics [obor bakal. práce]</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="7">
   <subfield code="a">bakalářské práce</subfield>
   <subfield code="7">fd132403</subfield>
   <subfield code="2">czenas</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="9">
   <subfield code="a">bachelor's theses</subfield>
   <subfield code="2">eczenas</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">classification</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">feature importance</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">feature extraction</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">interpretability</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">large language models</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Kliegr, Tomáš</subfield>
   <subfield code="%">ISIS:8484</subfield>
   <subfield code="4">ths</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Svátek, Vojtěch,</subfield>
   <subfield code="d">1967 prosinec 1.-</subfield>
   <subfield code="7">mzk2004217940</subfield>
   <subfield code="4">opn</subfield>
  </datafield>
  <datafield tag="710" ind1="2" ind2=" ">
   <subfield code="a">Vysoká škola ekonomická v Praze.</subfield>
   <subfield code="b">Fakulta informatiky a statistiky</subfield>
   <subfield code="7">kn20010709399</subfield>
   <subfield code="4">dgg</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/podrobnosti</subfield>
   <subfield code="y">VŠKP v InSIS</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858</subfield>
   <subfield code="y">Hlavní práce</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/posudek/vedouci</subfield>
   <subfield code="y">Hodnocení vedoucího</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/posudek/oponent/83889</subfield>
   <subfield code="y">Oponentura</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/priloha/29367</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/priloha/29368</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/priloha/29369</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/priloha/29370</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/priloha/29371</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/priloha/29372</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="999" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/86858/podrobnosti</subfield>
   <subfield code="y">dc:identifier</subfield>
  </datafield>
  <datafield tag="993" ind1=" " ind2=" ">
   <subfield code="x">NEPOSILAT</subfield>
   <subfield code="y">VSKP</subfield>
  </datafield>
  <datafield tag="999" ind1="4" ind2="9">
   <subfield code="a">vse86858</subfield>
   <subfield code="b">240823</subfield>
  </datafield>
  <datafield tag="999" ind1="4" ind2="5">
   <subfield code="x">86858</subfield>
  </datafield>
 </record>
</collection>
