Publikationen AS: Bibliographie 2021 BibTeX
@inproceedings {INPROC-2021-11,
author = {Christoph Stach and Julia Br{\"a}cker and Rebecca Eichler and Corinna Giebler and Bernhard Mitschang},
title = {{Demand-Driven Data Provisioning in Data Lakes: BARENTS - A Tailorable Data Preparation Zone}},
booktitle = {Proceedings of the 23rd International Conference on Information Integration and Web-based Applications \& Services (iiWAS2021); Linz, Austria, November 29-December 1, 2021},
editor = {Maria Indrawan-Santiago and Eric Pardede and Ivan Luiz Salvadori and Matthias Steinbauer and Ismail Khalil and Gabriele Kotsis},
address = {New York, NY, United States},
publisher = {Association for Computing Machinery (ACM)},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
pages = {1--12},
type = {Konferenz-Beitrag},
month = {November},
year = {2021},
isbn = {978-1-4503-9556-4},
doi = {10.1145/3487664.3487784},
keywords = {data pre-processing; data transformation; knowledge modeling; ontology; data management; Data Lakes; zone model; food analysis},
language = {Englisch},
cr-category = {H.2.7 Database Administration,
E.2 Data Storage Representations,
H.3.3 Information Search and Retrieval,
H.2.8 Database Applications},
contact = {Senden Sie eine E-Mail an christoph.stach@ipvs.uni-stuttgart.de.},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {Data has never been as significant as it is today. It can be acquired virtually
at will on any subject. Yet, this poses new challenges towards data management,
especially in terms of storage (data is not consumed during processing, i.e.,
the data volume keeps growing), flexibility (new applications emerge), and
operability (analysts are no IT experts). The goal has to be a demand-driven
data provisioning, i.e., the right data must be available in the right form at
the right time. Therefore, we introduce a tailorable data preparation zone for
Data Lakes called BARENTS. It enables users to model in an ontology how to
derive information from data and assign the information to use cases. The data
is automatically processed based on this model and the refined data is made
available to the appropriate use cases. Here, we focus on a resource-efficient
data management strategy. BARENTS can be embedded seamlessly into established
Big Data infrastructures, e.g., Data Lakes.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-11&engl=0}
}
@inproceedings {INPROC-2021-10,
author = {Alejandro Villanueva Zacarias and Christian Weber and Peter Reimann and Bernhard Mitschang},
title = {{AssistML: A Concept to Recommend ML Solutions for Predictive Use Cases}},
booktitle = {Proceedings of the 8th IEEE International Conference on Data Science and Advanced Analytics (DSAA 2021)},
address = {Porto, Portugal},
publisher = {IEEE},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
type = {Konferenz-Beitrag},
month = {Oktober},
year = {2021},
keywords = {Recommender Systems; Machine Learning; Meta Learning},
language = {Englisch},
cr-category = {H.2.8 Database Applications},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {The adoption of machine learning (ML) in organizations is characterized by the
use of multiple ML software components. Citizen data scientists face practical
requirements when building ML systems, which go beyond the known challenges of
ML, e. g., data engineering or parameter optimization. They are expected to
quickly identify ML system options that strike a suitable trade-off across
multiple performance criteria. These options also need to be understandable for
non-technical users. Addressing these practical requirements represents a
problem for citizen data scientists with limited ML experience. This calls for
a method to help them identify suitable ML software combinations. Related work,
e. g., AutoML systems, are not responsive enough or cannot balance different
performance criteria. In this paper, we introduce AssistML, a novel concept to
recommend ML solutions, i. e., software systems with ML models, for predictive
use cases. AssistML uses metadata of existing ML solutions to quickly identify
and explain options for a new use case. We implement the approach and evaluate
it with two exemplary use cases. Results show that AssistML proposes ML
solutions that are in line with users' performance preferences in seconds.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-10&engl=0}
}
@inproceedings {INPROC-2021-09,
author = {Eduard Wagner and Bernd Keller and Peter Reimann and Christoph Gr{\"o}ger and Dieter Spath},
title = {{Advanced Analytics for Evaluating Critical Joining Technologies in Automotive Body Structures and Body Shops}},
booktitle = {Proceedings of the 15th CIRP Conference on Intelligent Computation in Manufacturing Engineering (CIRP ICME)},
publisher = {Elsevier},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
type = {Konferenz-Beitrag},
month = {Juli},
year = {2021},
keywords = {Body Shop; Data Analytics; Data Mining; Advanced Analytics; Machine Learning},
language = {Englisch},
cr-category = {H.2.8 Database Applications},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {The product development process within the automotive industry is subject to
changing demands due to internal and external influences. These influences and
adjustments especially affect the car body and its inherent joining technology,
as critical stages of variant creation. However, current literature does not
offer a suitable analytical method to identify and assess these critical
influences. We propose an advanced analytics approach that combines data mining
and machine learning techniques within the car body substructure. The
evaluation within the Mercedes-Benz AG shows that our approach facilitates a
quantitative assessment of unknown interdependencies between car body modules
and corresponding joining techniques.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-09&engl=0}
}
@inproceedings {INPROC-2021-08,
author = {Alexander Birk and Yannick Wilhelm and Simon Dreher and Christian Flack and Peter Reimann and Christoph Gr{\"o}ger},
title = {{A Real-World Application of Process Mining for Data-Driven Analysis of Multi-Level Interlinked Manufacturing Processes}},
booktitle = {Procedia CIRP: Proceedings of the 54th CIRP Conference on Manufacturing Systems (CIRP CMS 2021)},
publisher = {Elsevier},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
type = {Konferenz-Beitrag},
month = {September},
year = {2021},
keywords = {Process Mining; Multi-level Interlinked Manufacturing Process; Heterogeneous Data Sources; Data Integration},
language = {Englisch},
cr-category = {H.2.8 Database Applications},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {Process Mining (PM) has huge potential for manufacturing process analysis.
However, there is little research on practical applications. We investigate a
real-world manufacturing process of pneumatic valves. The manufacturing process
comprises interlinked events at the superordinate business process level and at
the subordinate machine level, making its analysis based on PM challenging. We
show how to integrate heterogeneous data sources and give examples how PM
enables a deeper understanding of the manufacturing process, thereby helping to
uncover optimization potentials. Furthermore, we discuss challenges in data
integration and point out limitations of current PM techniques in
manufacturing.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-08&engl=0}
}
@inproceedings{INPROC-2021-07,
  author      = {Julian Ziegler and Peter Reimann and Florian Keller and Bernhard Mitschang},
  title       = {{A Metadata Model to Connect Isolated Data Silos and Activities of the CAE Domain}},
  booktitle   = {Proceedings of the 33rd International Conference on Advanced Information Systems Engineering (CAiSE)},
  publisher   = {Springer International Publishing},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {213--228},
  type        = {Konferenz-Beitrag},
  month       = {Juni},
  year        = {2021},
  keywords    = {Metadata Models; Graphs; Computer-aided Engineering},
  language    = {Englisch},
  cr-category = {H.2.8 Database Applications},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Computer-aided engineering (CAE) applications support the digital
transformation of the manufacturing industry. They facilitate virtual product
development and product testing via computer simulations. CAE applications
generate vast quantities of heterogeneous data. Domain experts struggle to
access and analyze them, because such engineering data are not sufficiently
described with metadata. In this paper, we characterize the CAE domain and
identify unsolved challenges for a tailored data and metadata management. For
instance, work activities in product development projects and their
relationships to data are not represented explicitly in current metadata
models. We propose a metadata model that addresses all challenges and provides
a connected view on all CAE data, metadata, and work activities of development
projects. We validate the feasibility of our metadata model through a
prototypical implementation and its application to a real-world use case. This
verifies that our metadata model addresses the CAE-specific challenges and this
way eases the task of domain experts to exploit relevant data.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-07&engl=0}
}
@inproceedings {INPROC-2021-06,
author = {Rebecca Eichler and Corinna Giebler and Christoph Gr{\"o}ger and Eva Hoos and Holger Schwarz and Bernhard Mitschang},
title = {{Enterprise-Wide Metadata Management - An Industry Case on the Current State and Challenges}},
booktitle = {24th International Conference on Business Information Systems},
editor = {Witold Abramowicz and S{\"o}ren Auer and El{\.z}bieta Lewa{\'n}ska},
publisher = {TIB Open Publishing},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
pages = {269--279},
type = {Konferenz-Beitrag},
month = {Juli},
year = {2021},
doi = {10.52825/bis.v1i.47},
language = {Englisch},
cr-category = {A.0 General Literature, General},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {Metadata management is a crucial success factor for companies today,
as for example, it enables exploiting data value fully or enables legal
compliance. With the emergence of new concepts, such as the data lake, and new
objectives, such as the enterprise-wide sharing of data, metadata management
has evolved and now poses a renewed challenge for companies. In this context,
we interviewed a globally active manufacturer to reveal how metadata management
is implemented in practice today and what challenges companies are faced with
and whether these constitute research gaps. As an outcome, we present the
company's metadata management goals and their corresponding solution
approaches and challenges. An evaluation of the challenges through a literature
and tool review yields three research gaps, which are concerned with the
topics: (1) metadata management for data lakes, (2) categorizations and
compositions of metadata management tools for comprehensive metadata
management, and (3) the use of data marketplaces as metadata-driven exchange
platforms within an enterprise. The gaps lay the groundwork for further
research activities in the field of metadata management and the industry case
represents a starting point for research to realign with real-world industry
needs.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-06&engl=0}
}
@inproceedings {INPROC-2021-05,
author = {Corinna Giebler and Christoph Gr{\"o}ger and Eva Hoos and Rebecca Eichler and Holger Schwarz and Bernhard Mitschang},
title = {{The Data Lake Architecture Framework}},
booktitle = {Datenbanksysteme f{\"u}r Business, Technologie und Web (BTW 2021), 19. Fachtagung des GI-Fachbereichs Datenbanken und Informationssysteme (DBIS), 13.-17. September 2021, Dresden, Germany},
publisher = {Gesellschaft f{\"u}r Informatik},
institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
pages = {351--370},
type = {Konferenz-Beitrag},
month = {September},
year = {2021},
doi = {10.18420/btw2021-19},
language = {Englisch},
cr-category = {H.4 Information Systems Applications},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {During recent years, data lakes emerged as a way to manage large amounts of
heterogeneous data for modern data analytics. Although various work on
individual aspects of data lakes exists, there is no comprehensive data lake
architecture yet. Concepts that describe themselves as a ``data lake
architecture'' are only partial. In this work, we introduce the data lake
architecture framework. It supports the definition of data lake architectures
by defining nine architectural aspects, i.e., perspectives on a data lake, such
as data storage or data modeling, and by exploring the interdependencies
between these aspects. The included methodology helps to choose appropriate
concepts to instantiate each aspect. To evaluate the framework, we use it to
configure an exemplary data lake architecture for a real-world data lake
implementation. This final assessment shows that our framework provides
comprehensive guidance in the configuration of a data lake architecture.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-05&engl=0}
}
@inproceedings{INPROC-2021-04,
  author      = {Manuel Fritz and Gang Shao and Holger Schwarz},
  title       = {{Automatic Selection of Analytic Platforms with ASAP-DM}},
  booktitle   = {Proceedings of the 33rd International Conference on Scientific and Statistical Database Management},
  publisher   = {ACM},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {220--225},
  type        = {Konferenz-Beitrag},
  month       = {Juli},
  year        = {2021},
  isbn        = {9781450384131},
  doi         = {10.1145/3468791.3468802},
  language    = {Englisch},
  cr-category = {H.2.8 Database Applications},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {The plethora of available analytic platforms escalates the difficulty of
selecting the most appropriate platform for a certain data mining task and
datasets with varying characteristics. Especially novice analysts experience
difficulties to keep up with the latest technical developments. In this demo,
we present the ASAP-DM framework. ASAP-DM is able to automatically select a
well-performing analytic platform for a given data mining task via an intuitive
web interface, thus especially supporting novice analysts. The take-aways for
demo attendees are: (1) a good understanding of the challenges of various data
mining workloads, dataset characteristics, and the effects on the selection of
analytic platforms, (2) useful insights on how ASAP-DM internally works, and
(3) how to benefit from ASAP-DM for exploratory data analysis.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-04&engl=0}
}
@inproceedings{INPROC-2021-03,
  author      = {Dennis Tschechlov and Manuel Fritz and Holger Schwarz},
  title       = {{AutoML4Clust: Efficient AutoML for Clustering Analyses}},
  booktitle   = {Proceedings of the 24th International Conference on Extending Database Technology (EDBT)},
  publisher   = {Online},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {1--6},
  type        = {Konferenz-Beitrag},
  month       = {M{\"a}rz},
  year        = {2021},
  doi         = {10.5441/002/EDBT.2021.32},
  language    = {Englisch},
  cr-category = {H.2.8 Database Applications},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Data analysis is a highly iterative process. In order to achieve valuable
analysis results, analysts typically execute many configurations, i.e.,
algorithms and their hyperparameter settings, based on their domain knowledge.
While experienced analysts may be able to define small search spaces for
promising configurations, especially novice analysts define large search spaces
due to their lack of domain knowledge. In the worst case, they perform an
exhaustive search throughout the whole search space, resulting in infeasible
runtimes. Recent advances in the research area of AutoML address this challenge
by supporting novice analysts in the combined algorithm selection and
hyperparameter optimization (CASH) problem for supervised learning tasks.
However, no such systems exist for unsupervised learning tasks, such as the
prevalent task of clustering analysis. In this work, we present our novel
AutoML4Clust approach, which efficiently supports novice analysts regarding
CASH for clustering analyses. To the best of our knowledge, this is the first
thoroughly elaborated approach in this area. Our comprehensive evaluation
unveils that AutoML4Clust significantly outperforms several existing
approaches, as it achieves considerable speedups for the CASH problem, while
still achieving very valuable clustering results.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-03&engl=0}
}
@inproceedings{INPROC-2021-02,
  author      = {Manuel Fritz and Dennis Tschechlov and Holger Schwarz},
  title       = {{Efficient Exploratory Clustering Analyses with Qualitative Approximations}},
  booktitle   = {Proceedings of the 24th International Conference on Extending Database Technology (EDBT)},
  publisher   = {Online},
  institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
  pages       = {1--6},
  type        = {Konferenz-Beitrag},
  month       = {M{\"a}rz},
  year        = {2021},
  doi         = {10.5441/002/EDBT.2021.31},
  language    = {Englisch},
  cr-category = {H.2.8 Database Applications},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {Clustering is a fundamental primitive for exploratory data analyses. Yet,
finding valuable clustering results for previously unseen datasets is a pivotal
challenge. Analysts as well as automated exploration methods often perform an
exploratory clustering analysis, i.e., they repeatedly execute a clustering
algorithm with varying parameters until valuable results can be found. k-center
clustering algorithms, such as k-Means, are commonly used in such exploratory
processes. However, in the worst case, each single execution of k-Means
requires a super-polynomial runtime, making the overall exploratory process on
voluminous datasets infeasible in a reasonable time frame. We propose a novel
and efficient approach for approximating results of k-center clustering
algorithms, thus supporting analysts in an ad-hoc exploratory process for
valuable clustering results. Our evaluation on an Apache Spark cluster unveils
that our approach significantly outperforms the regular execution of a k-center
clustering algorithm by several orders of magnitude in runtime with a
predefinable qualitative demand. Hence, our approach is a strong fit for
clustering voluminous datasets in exploratory settings.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-02&engl=0}
}
@article {ART-2021-05,
author = {Manuel Fritz and Michael Behringer and Dennis Tschechlov and Holger Schwarz},
title = {{Efficient exploratory clustering analyses in large-scale exploration processes}},
journal = {The VLDB Journal},
editor = {Georgia Koutrika and Ren{\'e}e J. Miller and Kyuseok Shim},
address = {Berlin, Heidelberg},
publisher = {Springer Berlin Heidelberg},
pages = {1--22},
type = {Artikel in Zeitschrift},
month = {November},
year = {2021},
doi = {10.1007/s00778-021-00716-y},
issn = {1066-8888},
keywords = {Exploratory clustering analysis; Exploration; Clustering; Centroid-based clustering},
language = {Englisch},
cr-category = {H.3.3 Information Search and Retrieval},
contact = {Senden Sie eine E-Mail an manuel.fritz@ipvs.uni-stuttgart.de.},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {Clustering is a fundamental primitive in manifold applications. In order to
achieve valuable results in exploratory clustering analyses, parameters of the
clustering algorithm have to be set appropriately, which is a tremendous
pitfall. We observe multiple challenges for large-scale exploration processes.
On the one hand, they require specific methods to efficiently explore large
parameter search spaces. On the other hand, they often exhibit large runtimes,
in particular when large datasets are analyzed using clustering algorithms with
super-polynomial runtimes, which repeatedly need to be executed within
exploratory clustering analyses. We address these challenges as follows: First,
we present LOG-Means and show that it provides estimates for the number of
clusters in sublinear time regarding the defined search space, i.e., provably
requiring less executions of a clustering algorithm than existing methods.
Second, we demonstrate how to exploit fundamental characteristics of
exploratory clustering analyses in order to significantly accelerate the
(repetitive) execution of clustering algorithms on large datasets. Third, we
show how these challenges can be tackled at the same time. To the best of our
knowledge, this is the first work which simultaneously addresses the
above-mentioned challenges. In our comprehensive evaluation, we unveil that our
proposed methods significantly outperform state-of-the-art methods, thus
especially supporting novice analysts for exploratory clustering analyses in
large-scale exploration processes.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-05&engl=0}
}
@article{ART-2021-04,
  author      = {Dennis Przytarski and Christoph Stach and Cl{\'e}mentine Gritti and Bernhard Mitschang},
  title       = {{Query Processing in Blockchain Systems: Current State and Future Challenges}},
  journal     = {Future Internet},
  editor      = {Dino Giuli and Andrew Hudson-Smith and Luis Javier Garcia Villalba},
  publisher   = {MDPI},
  volume      = {14},
  number      = {1},
  pages       = {1--31},
  type        = {Artikel in Zeitschrift},
  month       = {Dezember},
  year        = {2021},
  issn        = {1999-5903},
  doi         = {10.3390/fi14010001},
  keywords    = {blockchain systems; query processing; data models; data structures; block structures},
  language    = {Englisch},
  cr-category = {H.3.0 Information Storage and Retrieval General,
H.3.3 Information Search and Retrieval},
  contact     = {Senden Sie eine E-Mail an Dennis.Przytarski@ipvs.uni-stuttgart.de.},
  department  = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
  abstract    = {When, in 2008, Satoshi Nakamoto envisioned the first distributed database
management system that relied on cryptographically secured chain of blocks to
store data in an immutable and tamper-resistant manner, his primary use case
was the introduction of a digital currency. Owing to this use case, the
blockchain system was geared towards efficient storage of data, whereas the
processing of complex queries, such as provenance analyses of data history, is
out of focus. The increasing use of Internet of Things technologies and the
resulting digitization in many domains, however, have led to a plethora of
novel use cases for a secure digital ledger. For instance, in the healthcare
sector, blockchain systems are used for the secure storage and sharing of
electronic health records, while the food industry applies such systems to
enable a reliable food-chain traceability, e.g., to prove compliance with cold
chains. In these application domains, however, querying the current state is
not sufficient - comprehensive history queries are required instead. Due to
these altered usage modes involving more complex query types, it is
questionable whether today's blockchain systems are prepared for this type of
usage and whether such queries can be processed efficiently by them. In our
paper, we therefore investigate novel use cases for blockchain systems and
elicit their requirements towards a data store in terms of query capabilities.
We reflect the state of the art in terms of query support in blockchain systems
and assess whether it is capable of meeting the requirements of such more
sophisticated use cases. As a result, we identify future research challenges
with regard to query processing in blockchain systems.},
  url         = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-04&engl=0}
}
@article {ART-2021-03,
author = {Rebecca Eichler and Corinna Giebler and Christoph Gr{\"o}ger and Holger Schwarz and Bernhard Mitschang},
title = {{Modeling metadata in data lakes---A generic model}},
journal = {Data \& Knowledge Engineering},
publisher = {Elsevier},
volume = {136},
pages = {1--17},
type = {Artikel in Zeitschrift},
month = {November},
year = {2021},
issn = {0169-023X},
doi = {10.1016/j.datak.2021.101931},
keywords = {Metadata management; Metadata model; Data lake; Data management; Data lake zones; Metadata classification},
language = {Englisch},
cr-category = {H.2 Database Management},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {Data contains important knowledge and has the potential to provide new
insights. Due to new technological developments such as the Internet of Things,
data is generated in increasing volumes. In order to deal with these data
volumes and extract the data's value new concepts such as the data lake
were created. The data lake is a data management platform designed to handle
data at scale for analytical purposes. To prevent a data lake from becoming
inoperable and turning into a data swamp, metadata management is needed. To
store and handle metadata, a generic metadata model is required that can
reflect metadata of any potential metadata management use case, e.g., data
versioning or data lineage. However, an evaluation of existent metadata models
yields that none so far are sufficiently generic as their design basis is not
suited. In this work, we use a different design approach to build HANDLE, a
generic metadata model for data lakes. The new metadata model supports the
acquisition of metadata on varying granular levels, any metadata
categorization, including the acquisition of both metadata that belongs to a
specific data element as well as metadata that applies to a broader range of
data. HANDLE supports the flexible integration of metadata and can reflect the
same metadata in various ways according to the intended utilization.
Furthermore, it is created for data lakes and therefore also supports data lake
characteristics like data lake zones. With these capabilities HANDLE enables
comprehensive metadata management in data lakes. HANDLE's feasibility is
shown through the application to an exemplary access-use-case and a
prototypical implementation. By comparing HANDLE with existing models we
demonstrate that it can provide the same information as the other models as
well as adding further capabilities needed for metadata management in data
lakes.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-03&engl=0}
}
@inbook {INBOOK-2021-01,
author = {Dimitri Petrik and Mathias Mormul and Peter Reimann and Christoph Gr{\"o}ger},
title = {{Anforderungen f{\"u}r Zeitreihendatenbanken im industriellen IoT}},
series = {IoT -- Best Practices},
publisher = {Springer-Verlag},
pages = {339--377},
type = {Beitrag in Buch},
month = {Mai},
year = {2021},
keywords = {Zeitreihendaten; Zeitreihendatenbanken; Industrial IoT; Edge Computing; Data Lake; InfluxDB},
language = {Deutsch},
cr-category = {H.2.8 Database Applications},
department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
abstract = {Das industrielle Internet der Dinge (IIoT) integriert Informations- und
Kommunikationstechnologien in industrielle Prozesse und erweitert sie durch
Echtzeit-Datenanalyse. Hierbei sind sensorbasierte Zeitreihen ein wesentlicher
Typ von Daten, die in der industriellen Fertigung generiert werden.
Sensorbasierte Zeitreihendaten werden in regelm{\"a}{\ss}igen Abst{\"a}nden generiert
und enthalten zus{\"a}tzlich zum Sensorwert einen Zeitstempel. Spezielle
Zeitreihen-Datenbanken (eng.: Time Series Databases (TSDB)) sind daf{\"u}r
ausgelegt, Zeitreihendaten effizient zu speichern. Wenn TSDBs maschinennah, d.
h. in der industriellen Edge, eingesetzt werden, sind Maschinendaten zur
{\"U}berwachung zeitkritischer Prozesse aufgrund der niedrigen Latenz schnell
verf{\"u}gbar, was die erforderliche Zeit f{\"u}r die Datenverarbeitung reduziert.
Andererseits k{\"o}nnen TSDBs auch in den Data Lakes als skalierbaren
Datenplattformen zur Speicherung und Analyse von Rohdaten zum Einsatz kommen,
um die langfristige Vorhaltung von Zeitreihendaten zu erm{\"o}glichen. Bisherige
Untersuchungen zu TSDBs sind bei der Auswahl f{\"u}r den Einsatz in der
industriellen Edge und im Data Lake nicht vorhanden. Die meisten verf{\"u}gbaren
Benchmarks von TSDBs sind performanceorientiert und ber{\"u}cksichtigen nicht die
Randbedingungen einer industriellen Edge oder eines Data Lake. Wir adressieren
diese L{\"u}cke und identifizieren funktionale Kriterien f{\"u}r den Einsatz von
TSDBs in diesen beiden Umgebungen und bilden somit einen qualitativen
Kriterienkatalog. Des Weiteren zeigen wir am Beispiel von InfluxDB, wie dieser
Katalog verwendet werden kann, mit dem Ziel die systematische Auswahl einer
passenden TSDB f{\"u}r den Einsatz in der Edge und im Data Lake zu unterst{\"u}tzen.},
url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INBOOK-2021-01&engl=0}
}