<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
 <record>
  <leader>03836ntm a22005537i 4500</leader>
  <controlfield tag="001">000259874</controlfield>
  <controlfield tag="003">CZ-PrVSE</controlfield>
  <controlfield tag="005">20220604071223.0</controlfield>
  <controlfield tag="006">m        d</controlfield>
  <controlfield tag="007">cr n||||||||||</controlfield>
  <controlfield tag="008">220604s2022    xr     fsbm   000 0 eng d</controlfield>
  <datafield tag="STA" ind1=" " ind2=" ">
   <subfield code="a">NEZPRACOVANÝ IMPORT</subfield>
  </datafield>
  <datafield tag="040" ind1=" " ind2=" ">
   <subfield code="a">ABA006</subfield>
   <subfield code="b">cze</subfield>
   <subfield code="c">ABA006</subfield>
   <subfield code="d">ABA006</subfield>
   <subfield code="e">rda</subfield>
  </datafield>
  <datafield tag="100" ind1="1" ind2=" ">
   <subfield code="a">Holub, Ondřej</subfield>
   <subfield code="%">ISIS:118860</subfield>
   <subfield code="4">dis</subfield>
  </datafield>
  <datafield tag="242" ind1="1" ind2="0">
   <subfield code="a">Analýza data lakehouse :</subfield>
   <subfield code="b">Nejnovější evoluce v big data architekturách a její přínosy pro data science</subfield>
   <subfield code="y">eng</subfield>
  </datafield>
  <datafield tag="245" ind1="1" ind2="0">
   <subfield code="a">Analyzing Data Lakehouse :</subfield>
   <subfield code="b">The Latest Evolution of Big Data Architectures and Its Benefits for Data Science</subfield>
   <subfield code="c">Ondřej Holub</subfield>
  </datafield>
  <datafield tag="264" ind1=" " ind2="0">
   <subfield code="c">2022</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
   <subfield code="a">?? stran :</subfield>
   <subfield code="3">digital, PDF soubor</subfield>
  </datafield>
  <datafield tag="500" ind1=" " ind2=" ">
   <subfield code="a">Vedoucí práce: Petr Máša</subfield>
  </datafield>
  <datafield tag="502" ind1=" " ind2=" ">
   <subfield code="a">Diplomová práce (Ing.)—Vysoká škola ekonomická v Praze. Fakulta informatiky a statistiky, 2022</subfield>
  </datafield>
  <datafield tag="504" ind1=" " ind2=" ">
   <subfield code="a">Obsahuje bibliografii</subfield>
  </datafield>
  <datafield tag="516" ind1=" " ind2=" ">
   <subfield code="a">Textový (vysokoškolská kvalifikační práce)</subfield>
  </datafield>
  <datafield tag="518" ind1=" " ind2=" ">
   <subfield code="a">Rok obhajoby 2022</subfield>
  </datafield>
  <datafield tag="520" ind1="3" ind2=" ">
   <subfield code="a">The data lakehouse is a new-generation distributed data analytics platform architecture that combines key benefits of data lakes and warehouses into a unified design, reconciling diverse data science and business intelligence workloads over a common data foundation. This thesis examines the novel data lakehouse architecture and a set of associated data preprocessing and data science technologies and demonstrates how data science projects can benefit from them. First, the author describes and analyzes the emerging lakehouse architectural pattern with a primary focus on data storage, integration, and preprocessing as an integral lifecycle stage for data and knowledge mining projects and methods such as machine learning. Second, the thesis describes modules of the Apache Spark ecosystem relevant for machine learning and streaming. Third, the author formulates a subset of challenges observed in current enterprise big data preprocessing and machine learning practice and verifies how integrating described technologies can lead to partially or fully answering such challenges and ultimately increase the business value gained from data science. This work presents a proof of concept implementation illustrating streaming data preprocessing, interactive machine learning feature engineering and modeling, and streaming predictions in the context of a simulated data science project, demonstrating the resolution of the defined data engineering challenges observed in the data science lifecycle, such as supporting data reproducibility and handling upserts, using distributed data lakehouse technologies in a cloud setting.</subfield>
  </datafield>
  <datafield tag="538" ind1=" " ind2=" ">
   <subfield code="a">Způsob přístupu: Internet</subfield>
  </datafield>
  <datafield tag="653" ind1="0" ind2=" ">
   <subfield code="a">znalostní a webové technologie [obor dipl. práce]</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="7">
   <subfield code="a">diplomové práce</subfield>
   <subfield code="7">fd132022</subfield>
   <subfield code="2">czenas</subfield>
  </datafield>
  <datafield tag="655" ind1=" " ind2="9">
   <subfield code="a">master's theses</subfield>
   <subfield code="2">eczenas</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">data lakehouse</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">data science</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">data engineering</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">big data</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">machine learning</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">Delta Lake</subfield>
  </datafield>
  <datafield tag="690" ind1=" " ind2=" ">
   <subfield code="a">Apache Spark</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Máša, Petr</subfield>
   <subfield code="%">ISIS:17194</subfield>
   <subfield code="4">ths</subfield>
  </datafield>
  <datafield tag="700" ind1="1" ind2=" ">
   <subfield code="a">Rauch, Jan,</subfield>
   <subfield code="d">1948-</subfield>
   <subfield code="7">jn20001227021</subfield>
   <subfield code="4">opn</subfield>
  </datafield>
  <datafield tag="710" ind1="2" ind2=" ">
   <subfield code="a">Vysoká škola ekonomická v Praze.</subfield>
   <subfield code="b">Fakulta informatiky a statistiky</subfield>
   <subfield code="7">kn20010709399</subfield>
   <subfield code="4">dgg</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/75024/podrobnosti</subfield>
   <subfield code="y">VŠKP v InSIS</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/75024</subfield>
   <subfield code="y">Hlavní práce</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/75024/posudek/vedouci</subfield>
   <subfield code="y">Hodnocení vedoucího</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/75024/posudek/oponent/74663</subfield>
   <subfield code="y">Oponentura</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/75024/priloha/23614</subfield>
   <subfield code="y">Přiloha k práci</subfield>
  </datafield>
  <datafield tag="999" ind1="4" ind2="0">
   <subfield code="u">https://insis.vse.cz/zp/75024/podrobnosti</subfield>
   <subfield code="y">dc:identifier</subfield>
  </datafield>
  <datafield tag="993" ind1=" " ind2=" ">
   <subfield code="x">NEPOSILAT</subfield>
   <subfield code="y">VSKP</subfield>
  </datafield>
  <datafield tag="999" ind1="4" ind2="9">
   <subfield code="a">vse75024</subfield>
   <subfield code="b">220602</subfield>
  </datafield>
  <datafield tag="999" ind1="4" ind2="5">
   <subfield code="x">75024</subfield>
  </datafield>
 </record>
</collection>
