Publikationen AS: Bibliographie 2024 BibTeX
% Demo paper in the DASFAA 2024 proceedings (LNCS vol. 14856) on interactive
% refinement of clustering results by domain experts.
% NOTE(review): only the first editor ("Onizuka, M., et al.") was recorded in
% the original booktitle string; complete the editor list if needed.
@inproceedings{INPROC-2024-11,
  author      = {Michael Behringer and Dennis Treder-Tschechlov and Jannis Rapp},
  title       = {{Empowering Domain Experts to Enhance Clustering Results Through Interactive Refinement}},
  booktitle   = {Database Systems for Advanced Applications (DASFAA 2024)},
  editor      = {Onizuka, M. and others},
  address     = {Singapore},
  publisher   = {Springer},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  series      = {Lecture Notes in Computer Science},
  volume      = {14856},
  pages       = {518--522},
  type        = {Konferenz-Beitrag},
  month       = {September},
  year        = {2024},
  doi         = {10.1007/978-981-97-5575-2_51},
  language    = {Englisch},
  cr-category = {I.5.3 Pattern Recognition Clustering},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Data mining is crucial to gain knowledge from large amounts of data. One
popular data mining technique is clustering aiming to group similar data
together. This technique relies on domain knowledge to interpret the results.
However, the initial results are often insufficient and must be refined -
taking tremendous time and resources with unclear benefits. In this demo paper,
we introduce our novel user-centric approach that supports domain experts in
interactively refining clustering results to their needs by merging and
splitting clusters, specifying constraints, or by applying active learning -
combined in one single tool.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-11&engl=0},
}
% PVLDB 17(11) paper proposing EffEns, an ensemble clustering approach based on
% meta-learning and hyperparameter optimization.
% NOTE(review): INPROC-2024-09, INPROC-2024-08 and INPROC-2024-07 carry the same
% authors, title, venue and pages - likely duplicate records; confirm and
% deduplicate in the source database.
% NOTE(review): the original booktitle string dated the issue "July 2024" while
% month says August - confirm which date is intended.
@inproceedings{INPROC-2024-10,
  author      = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
  title       = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
  booktitle   = {Proceedings of the VLDB Endowment},
  publisher   = {VLDB Endowment},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  volume      = {17},
  number      = {11},
  pages       = {2880--2892},
  type        = {Konferenz-Beitrag},
  month       = {August},
  year        = {2024},
  doi         = {10.14778/3681954.3681970},
  language    = {Englisch},
  cr-category = {I.5.3 Pattern Recognition Clustering},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Efficient clustering algorithms, such as k-Means, are often used in practice
because they scale well for large datasets. However, they are only able to
detect simple data characteristics. Ensemble clustering can overcome this
limitation by combining multiple results of efficient algorithms. However,
analysts face several challenges when applying ensemble clustering, i. e.,
analysts struggle to (a) efficiently generate an ensemble and (b) combine the
ensemble using a suitable consensus function with a corresponding
hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
clustering approach to address these challenges. Our approach relies on
meta-learning to learn about dataset characteristics and the correlation
between generated base clusterings and the performance of consensus functions.
We apply the learned knowledge to generate appropriate ensembles and select a
suitable consensus function to combine their results. Further, we use a
state-of-the-art optimization technique to tune the hyperparameters of the
selected consensus function. Our comprehensive evaluation on synthetic and
real-world datasets demonstrates that EffEns significantly outperforms
state-of-the-art approaches w.r.t. accuracy and runtime.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-10&engl=0},
}
% PVLDB 17(11) paper proposing EffEns (ensemble clustering via meta-learning).
% NOTE(review): same authors, title, venue, pages and DOI as INPROC-2024-10 -
% likely a duplicate record; kept because it has its own key and url id.
% NOTE(review): the original booktitle string dated the issue "July 2024" while
% month says August - confirm which date is intended.
@inproceedings{INPROC-2024-09,
  author      = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
  title       = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
  booktitle   = {Proceedings of the VLDB Endowment},
  publisher   = {VLDB Endowment},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  volume      = {17},
  number      = {11},
  pages       = {2880--2892},
  type        = {Konferenz-Beitrag},
  month       = {August},
  year        = {2024},
  doi         = {10.14778/3681954.3681970},
  language    = {Englisch},
  cr-category = {I.5.3 Pattern Recognition Clustering},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Efficient clustering algorithms, such as k-Means, are often used in practice
because they scale well for large datasets. However, they are only able to
detect simple data characteristics. Ensemble clustering can overcome this
limitation by combining multiple results of efficient algorithms. However,
analysts face several challenges when applying ensemble clustering, i. e.,
analysts struggle to (a) efficiently generate an ensemble and (b) combine the
ensemble using a suitable consensus function with a corresponding
hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
clustering approach to address these challenges. Our approach relies on
meta-learning to learn about dataset characteristics and the correlation
between generated base clusterings and the performance of consensus functions.
We apply the learned knowledge to generate appropriate ensembles and select a
suitable consensus function to combine their results. Further, we use a
state-of-the-art optimization technique to tune the hyperparameters of the
selected consensus function. Our comprehensive evaluation on synthetic and
real-world datasets demonstrates that EffEns significantly outperforms
state-of-the-art approaches w.r.t. accuracy and runtime.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-09&engl=0},
}
% PVLDB 17(11) paper proposing EffEns (ensemble clustering via meta-learning).
% NOTE(review): same authors, title, venue, pages and DOI as INPROC-2024-10 -
% likely a duplicate record; kept because it has its own key and url id.
% NOTE(review): the original booktitle string dated the issue "July 2024" while
% month says August - confirm which date is intended.
@inproceedings{INPROC-2024-08,
  author      = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
  title       = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
  booktitle   = {Proceedings of the VLDB Endowment},
  publisher   = {VLDB Endowment},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  volume      = {17},
  number      = {11},
  pages       = {2880--2892},
  type        = {Konferenz-Beitrag},
  month       = {August},
  year        = {2024},
  doi         = {10.14778/3681954.3681970},
  language    = {Englisch},
  cr-category = {I.5.3 Pattern Recognition Clustering},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Efficient clustering algorithms, such as k-Means, are often used in practice
because they scale well for large datasets. However, they are only able to
detect simple data characteristics. Ensemble clustering can overcome this
limitation by combining multiple results of efficient algorithms. However,
analysts face several challenges when applying ensemble clustering, i. e.,
analysts struggle to (a) efficiently generate an ensemble and (b) combine the
ensemble using a suitable consensus function with a corresponding
hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
clustering approach to address these challenges. Our approach relies on
meta-learning to learn about dataset characteristics and the correlation
between generated base clusterings and the performance of consensus functions.
We apply the learned knowledge to generate appropriate ensembles and select a
suitable consensus function to combine their results. Further, we use a
state-of-the-art optimization technique to tune the hyperparameters of the
selected consensus function. Our comprehensive evaluation on synthetic and
real-world datasets demonstrates that EffEns significantly outperforms
state-of-the-art approaches w.r.t. accuracy and runtime.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-08&engl=0},
}
% PVLDB 17(11) paper proposing EffEns (ensemble clustering via meta-learning).
% NOTE(review): same authors, title, venue and pages as INPROC-2024-10 - likely
% a duplicate record; kept because it has its own key and url id. The DOI is
% the one recorded on INPROC-2024-10/-09/-08 for the same work.
% NOTE(review): the original booktitle string dated the issue "July 2024" while
% month says August - confirm which date is intended.
@inproceedings{INPROC-2024-07,
  author      = {Dennis Treder-Tschechlov and Manuel Fritz and Holger Schwarz and Bernhard Mitschang},
  title       = {{Ensemble Clustering based on Meta-Learning and Hyperparameter Optimization}},
  booktitle   = {Proceedings of the VLDB Endowment},
  publisher   = {VLDB Endowment},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  volume      = {17},
  number      = {11},
  pages       = {2880--2892},
  type        = {Konferenz-Beitrag},
  month       = {August},
  year        = {2024},
  doi         = {10.14778/3681954.3681970},
  language    = {Englisch},
  cr-category = {I.5.3 Pattern Recognition Clustering},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Efficient clustering algorithms, such as k-Means, are often used in practice
because they scale well for large datasets. However, they are only able to
detect simple data characteristics. Ensemble clustering can overcome this
limitation by combining multiple results of efficient algorithms. However,
analysts face several challenges when applying ensemble clustering, i. e.,
analysts struggle to (a) efficiently generate an ensemble and (b) combine the
ensemble using a suitable consensus function with a corresponding
hyperparameter setting. In this paper, we propose EffEns, an efficient ensemble
clustering approach to address these challenges. Our approach relies on
meta-learning to learn about dataset characteristics and the correlation
between generated base clusterings and the performance of consensus functions.
We apply the learned knowledge to generate appropriate ensembles and select a
suitable consensus function to combine their results. Further, we use a
state-of-the-art optimization technique to tune the hyperparameters of the
selected consensus function. Our comprehensive evaluation on synthetic and
real-world datasets demonstrates that EffEns significantly outperforms
state-of-the-art approaches w.r.t. accuracy and runtime.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-07&engl=0},
}
% DEXA 2024 paper (LNCS vol. 14911) presenting LALO, a virtual data lake zone
% for composing data products on demand.
% The em dash in the title is written as the LaTeX ligature "---" so the file
% stays ASCII-only like its other entries.
@inproceedings{INPROC-2024-06,
  author      = {Christoph Stach and Yunxuan Li and Laura Schuiki and Bernhard Mitschang},
  title       = {{LALO---A Virtual Data Lake Zone for Composing Tailor-Made Data Products on Demand}},
  booktitle   = {Proceedings of the 35th International Conference on Database and Expert Systems Applications (DEXA 2024)},
  editor      = {Christine Strauss and Toshiyuki Amagasa and Giuseppe Manco and Gabriele Kotsis and A Min Tjoa and Ismail Khalil},
  address     = {Cham},
  publisher   = {Springer},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  series      = {Lecture Notes in Computer Science},
  volume      = {14911},
  pages       = {288--305},
  type        = {Konferenz-Beitrag},
  month       = {August},
  year        = {2024},
  isbn        = {978-3-031-68311-4},
  issn        = {0302-9743},
  doi         = {10.1007/978-3-031-68312-1_22},
  keywords    = {Data Product; Virtual Data Lake Zone; Data Stream Adaptation},
  language    = {Englisch},
  cr-category = {H.2.7 Database Administration,
E.2 Data Storage Representations,
H.3.3 Information Search and Retrieval,
H.2.8 Database Applications},
  contact     = {Senden Sie eine E-Mail an \<christoph.stach@ipvs.uni-stuttgart.de\>.},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {The emerging paradigm of data products, which has become increasingly popular
recently due to the rise of data meshes and data marketplaces, also poses
unprecedented challenges for data management. Current data architectures,
namely data warehouses and data lakes, are not able to meet these challenges
adequately. In particular, these architectures are not designed for a
just-in-time provision of highly customized data products tailored perfectly to
the needs of customers. In this paper, we therefore present a virtual data lake
zone for composing tailor-made data products on demand, called LALO. LALO uses
data streaming technologies to enable just-in-time composing of data products
without allocating storage space in the data architecture permanently. In order
to enable customers to tailor data products to their needs, LALO uses a novel
mechanism that enables live adaptation of data streams. Evaluation results show
that the overhead for such an adaptation is negligible. Therefore, LALO
represents an efficient solution for the appropriate handling of data products,
both in terms of storage space and runtime.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-06&engl=0},
}
% Workshop paper (CEUR Workshop Proceedings vol. 3714) giving an overview of the
% evolution of analytical data platforms from data warehouses over data lakes
% to lakehouses.
@inproceedings{INPROC-2024-05,
  author      = {Jan Schneider and Christoph Gr{\"o}ger and Arnold Lutsch},
  title       = {{The Data Platform Evolution: From Data Warehouses over Data Lakes to Lakehouses}},
  booktitle   = {Proceedings of the 34th GI-Workshop on Foundations of Databases (Grundlagen von Datenbanken), Hirsau, Germany},
  editor      = {Holger Schwarz},
  publisher   = {CEUR Workshop Proceedings},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  series      = {CEUR Workshop Proceedings},
  volume      = {3714},
  pages       = {67--71},
  type        = {Workshop-Beitrag},
  month       = {Juli},
  year        = {2024},
  issn        = {1613-0073},
  keywords    = {Lakehouse; Data Warehouse; Data Lake; Data Management; Data Analytics},
  language    = {Englisch},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software,
H.4.2 Information Systems Applications Types of Systems},
  ee          = {https://ceur-ws.org/Vol-3714/invited2.pdf},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {The continuously increasing availability of data and the growing maturity of
data-driven analysis techniques have encouraged enterprises to collect and
analyze huge amounts of business-relevant data in order to exploit it for
competitive advantages. To facilitate these processes, various platforms for
analytical data management have been developed: While data warehouses have
traditionally been used by business analysts for reporting and OLAP, data lakes
emerged as an alternative concept that also supports advanced analytics. As
these two common types of data platforms show rather contrary characteristics
and target different user groups and analytical approaches, enterprises usually
need to employ both of them, resulting in complex, error-prone and
cost-expensive architectures. To address these issues, efforts have recently
become apparent to combine features of data warehouses and data lakes into
so-called lakehouses, which pursue to serve all kinds of analytics from a
single data platform. This paper provides an overview on the evolution of
analytical data platforms from data warehouses over data lakes to lakehouses
and elaborates on the vision and characteristics of the latter. Furthermore, it
addresses the question of what aspects common data lakes are currently missing
that prevent them from transitioning to lakehouses.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-05&engl=0},
}
% Workshop paper (CEUR Workshop Proceedings vol. 3710) reporting a case study on
% the application of lakehouses in an industrial setting.
% 1613-0073 is the series ISSN (same value as on INPROC-2024-05), so it is
% stored in issn rather than isbn.
@inproceedings{INPROC-2024-04,
  author      = {Jan Schneider and Arnold Lutsch and Christoph Gr{\"o}ger and Holger Schwarz and Bernhard Mitschang},
  title       = {{First Experiences on the Application of Lakehouses in Industrial Practice}},
  booktitle   = {Proceedings of the 35th GI-Workshop on Foundations of Databases (Grundlagen von Datenbanken), Herdecke, Germany},
  editor      = {Uta St{\"o}rl},
  publisher   = {CEUR Workshop Proceedings},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  series      = {CEUR Workshop Proceedings},
  volume      = {3710},
  pages       = {3--8},
  type        = {Workshop-Beitrag},
  month       = {Juni},
  year        = {2024},
  issn        = {1613-0073},
  keywords    = {Data Lakehouse; Data Platform; Platform Architecture; Data Analytics; Case Study; Industry Experience},
  language    = {Englisch},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software,
H.4.2 Information Systems Applications Types of Systems},
  ee          = {https://ceur-ws.org/Vol-3710/paper1.pdf},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {In recent years, so-called lakehouses have emerged as a new type of data
platform that intends to combine characteristics of data warehouses and data
lakes. Although companies started to employ the associated concepts and
technologies as part of their analytics architectures, little is known about
their practical medium- and long-term experiences as well as proven
architectural decisions. Additionally, there is only limited knowledge about
how lakehouses can be utilized effectively in an industrial context. Hence, it
remains unclear under which circumstances lakehouses represent a viable
alternative to conventional data platforms. To address this gap, we conducted a
case study on a real-world industrial case, in which manufacturing data needs
to be managed and analytically exploited. Within the scope of this case, a
dedicated analytics department has been testing and leveraging a lakehouse
approach for several months in a productive environment with high data volumes
and various types of analytical workloads. The paper at hand presents the
results of our within-case analyses and focuses on the industrial setting of
the case as well as the architecture of the utilized lakehouse. This way, it
provides preliminary insights on the application of lakehouses in industrial
practice and refers to useful architectural decisions.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-04&engl=0},
}
% MDM 2024 paper introducing HySAAD, a hybrid approach for selecting
% anonymization techniques in the automotive domain.
% NOTE(review): the editor list differs from INPROC-2024-02 although both cite
% the same MDM 2024 proceedings - confirm against the published front matter.
@inproceedings{INPROC-2024-03,
  author      = {Andrea Fieschi and Pascal Hirmer and Sachin Agrawal and Christoph Stach and Bernhard Mitschang},
  title       = {{HySAAD - A Hybrid Selection Approach for Anonymization by Design in the Automotive Domain}},
  booktitle   = {Proceedings of the 25th IEEE International Conference on Mobile Data Management (MDM 2024)},
  editor      = {Chiara Renso and Mahmoud Sakr and Walid G. Aref and Ashley Song and Cheng Long},
  address     = {Los Alamitos, Washington, Tokyo},
  publisher   = {IEEE Computer Society Conference Publishing Services},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {203--210},
  type        = {Konferenz-Beitrag},
  month       = {Juni},
  year        = {2024},
  isbn        = {979-8-3503-7455-1},
  issn        = {2375-0324},
  doi         = {10.1109/MDM61037.2024.00044},
  keywords    = {anonymization; connected vehicles; privacy protection; metrics},
  language    = {Englisch},
  cr-category = {K.4.1 Computers and Society Public Policy Issues},
  contact     = {Senden Sie eine E-Mail an \<andrea.fieschi@ipvs.uni-stuttgart.de\>.},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {The increasing connectivity and data exchange between vehicles and the cloud
have led to growing privacy concerns. To keep on gaining product insights
through data collection while guaranteeing privacy protection, an
anonymization-by-design approach should be used. A rising number of
anonymization methods, not limited to the automotive domain, can be found in
the literature and practice. The developers need support to select the suitable
anonymization technique. To this end, we make the following two contributions:
1) We apply our knowledge from the automotive domain to outline the usage of
qualitative metrics for anonymization techniques assessment; 2) We introduce
HySAAD, a hybrid selection approach for anonymization by design that leverages
this groundwork by recommending appropriate anonymization techniques for each
mobile data analytics use case based on both, qualitative (i.e., ``soft'') metrics
and quantitative (i.e., ``hard'') metrics. Using a real-world use case from the
automotive, we demonstrate the applicability and effectiveness of HySAAD.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-03&engl=0},
}
% MDM 2024 paper introducing PaDS, a framework for adapting the data pipeline
% of a smart car to changing privacy requirements.
% NOTE(review): the editor list differs from INPROC-2024-03 although both cite
% the same MDM 2024 proceedings - confirm against the published front matter.
@inproceedings{INPROC-2024-02,
  author      = {Yunxuan Li and Christoph Stach and Bernhard Mitschang},
  title       = {{PaDS: An adaptive and privacy-enabling Data Pipeline for Smart Cars}},
  booktitle   = {Proceedings of the 25th IEEE International Conference on Mobile Data Management (MDM 2024)},
  editor      = {Chiara Renso and Mahmoud Sakr and Walid G. Aref and Kyoung-Sook Kim and Manos Papagelis and Dimitris Sacharidis},
  address     = {Los Alamitos, Washington, Tokyo},
  publisher   = {IEEE Computer Society Conference Publishing Services},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {41--50},
  type        = {Konferenz-Beitrag},
  month       = {Juni},
  year        = {2024},
  isbn        = {979-8-3503-7455-1},
  issn        = {2375-0324},
  doi         = {10.1109/MDM61037.2024.00026},
  keywords    = {smart car; privacy-enabling data pipeline; datastream runtime adaptation; mobile data privacy management},
  language    = {Englisch},
  cr-category = {K.4.1 Computers and Society Public Policy Issues},
  contact     = {Senden Sie eine E-Mail an \<yunxuan.li@ipvs.uni-stuttgart.de\>.},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {The extensive use of onboard sensors in smart cars enables the collection,
processing, and dissemination of large amounts of mobile data containing
information about the vehicle, its driver, and even bystanders. Despite the
undoubted benefits of such smart cars, this leads to significant privacy
concerns. Due to their inherent mobility, the situation of smart cars changes
frequently, and with it, the appropriate measures to counteract the exposure of
private data. However, data management in such vehicles lacks sufficient
support for this privacy dynamism. We therefore introduce PaDS, a framework for
Privacy adaptive Data Stream. The focus of this paper is to enable adaptive
data processing within the vehicle data stream. With PaDS, Privacy-Enhancing
Technologies can be deployed dynamically in the data pipeline of a smart car
according to the current situation without user intervention. With a comparison
of state-of-the-art approaches, we demonstrate that our solution is very
efficient as it does not require a complete restart of the data pipeline.
Moreover, compared to a static approach, PaDS causes only minimal overhead
despite its dynamic adaptation of the data pipeline to react to changing
privacy requirements. This renders PaDS an effective privacy solution for smart
cars.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-02&engl=0},
}
% HICSS '24 paper comparing four data layouts (document, row, column, triple
% store models) for a blockchain storage engine.
@inproceedings{INPROC-2024-01,
  author      = {Dennis Przytarski and Christoph Stach and Bernhard Mitschang},
  title       = {{Assessing Data Layouts to Bring Storage Engine Functionality to Blockchain Technology}},
  booktitle   = {Proceedings of the 57th Hawaii International Conference on System Sciences (HICSS '24)},
  editor      = {Tung X. Bui},
  publisher   = {ScholarSpace},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {5091--5100},
  type        = {Konferenz-Beitrag},
  month       = {Januar},
  year        = {2024},
  isbn        = {978-0-9981331-7-1},
  keywords    = {blockchain; storage engine; queries},
  language    = {Englisch},
  cr-category = {H.3.1 Content Analysis and Indexing,
H.3.2 Information Storage,
H.3.3 Information Search and Retrieval},
  ee          = {https://hdl.handle.net/10125/106995},
  contact     = {Senden Sie eine E-Mail an \<Christoph.Stach@ipvs.uni-stuttgart.de\>.},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Nowadays, modern applications often use blockchains as a secure data store.
However, querying blockchain data is more challenging than querying
conventional databases due to blockchains being primarily designed for the
logging of asset transfers, such as cryptocurrencies, rather than storing and
reading generic data. To improve the experience of querying blockchain data and
make it comparable to querying conventional databases, new design approaches of
the storage engine for blockchain technology are required. An important aspect
is the data layout of a block, as it plays a crucial role in facilitating
reading of blockchain data. In this paper, we identify a suitable data layout
that provides the required query capabilities while preserving the key
properties of blockchain technology. Our goal is to overcome the limitations of
current data access models in blockchains, such as the reliance on auxiliary
data storages and error-prone smart contracts. To this end, we compare four
promising data layouts with data models derived from document, row, column, and
triple stores in terms of schema flexibility, read pattern generality, and
relational algebra suitability. We then assess the most suitable data layout
for blockchain technology.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2024-01&engl=0},
}
% Journal article (SN Computer Science, vol. 5, no. 5) giving an overview of
% concepts and technologies related to the lakehouse paradigm.
@article{ART-2024-01,
  author      = {Jan Schneider and Christoph Gr{\"o}ger and Arnold Lutsch and Holger Schwarz and Bernhard Mitschang},
  title       = {{The Lakehouse: State of the Art on Concepts and Technologies}},
  journal     = {SN Computer Science},
  publisher   = {Springer Nature},
  volume      = {5},
  number      = {5},
  pages       = {1--39},
  type        = {Artikel in Zeitschrift},
  month       = {April},
  year        = {2024},
  issn        = {2661-8907},
  doi         = {10.1007/s42979-024-02737-0},
  keywords    = {Data Lakehouse; Data Lake; Data Platform; Data Analytics},
  language    = {Englisch},
  cr-category = {H.3.4 Information Storage and Retrieval Systems and Software},
  ee          = {https://doi.org/10.1007/s42979-024-02737-0,
https://link.springer.com/content/pdf/10.1007/s42979-024-02737-0.pdf},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {In the context of data analytics, so-called lakehouses refer to novel variants
of data platforms that attempt to combine characteristics of data warehouses
and data lakes. In this way, lakehouses promise to simplify enterprise
analytics architectures, which often suffer from high operational costs, slow
analytical processes and further shortcomings resulting from data replication.
However, different views and notions on the lakehouse paradigm exist, which are
commonly driven by individual technologies and varying analytical use cases.
Therefore, it remains unclear what challenges lakehouses address, how they can
be characterized and which technologies can be leveraged to implement them.
This paper addresses these issues by providing an extensive overview of
concepts and technologies that are related to the lakehouse paradigm and by
outlining lakehouses as a distinct architectural approach for data platforms.
Concepts and technologies from literature with regard to lakehouses are
discussed, based on which a conceptual foundation for lakehouses is
established. In addition, several popular technologies are evaluated regarding
their suitability for the building of lakehouses. All findings are supported
and demonstrated with the help of a representative analytics scenario. Typical
challenges of conventional data platforms are identified, a new, sharper
definition for lakehouses is proposed and technical requirements for lakehouses
are derived. As part of an evaluation, these requirements are applied to
several popular technologies, of which frameworks for data lakes turn out to be
particularly helpful for the construction of lakehouses. Our work provides an
overview of the state of the art and a conceptual foundation for the lakehouse
paradigm, which can support future research.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2024-01&engl=0},
}