Institut für Parallele und Verteilte Systeme (IPVS)

Publications

An overview of the publications of the Institute for Parallel and Distributed Systems

Publications AS: Bibliography 2021 (BibTeX)

 
@inproceedings {INPROC-2021-11,
   author = {Christoph Stach and Julia Br{\"a}cker and Rebecca Eichler and Corinna Giebler and Bernhard Mitschang},
   title = {{Demand-Driven Data Provisioning in Data Lakes: BARENTS - A Tailorable Data Preparation Zone}},
   booktitle = {Proceedings of the 23rd International Conference on Information Integration and Web-based Applications \& Services (iiWAS2021); Linz, Austria, November 29-December 1, 2021},
   editor = {Maria Indrawan-Santiago and Eric Pardede and Ivan Luiz Salvadori and Matthias Steinbauer and Ismail Khalil and Gabriele Kotsis},
   address = {New York, NY, United States},
   publisher = {Association for Computing Machinery (ACM)},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {1--12},
   type = {Konferenz-Beitrag},
   month = {November},
   year = {2021},
   isbn = {978-1-4503-9556-4},
   doi = {10.1145/3487664.3487784},
   keywords = {data pre-processing; data transformation; knowledge modeling; ontology; data management; Data Lakes; zone model; food analysis},
   language = {Englisch},
   cr-category = {H.2.7 Database Administration,     E.2 Data Storage Representations,     H.3.3 Information Search and Retrieval,     H.2.8 Database Applications},
   contact = {Senden Sie eine E-Mail an christoph.stach@ipvs.uni-stuttgart.de.},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Data has never been as significant as it is today. It can be acquired virtually at will on any subject. Yet, this poses new challenges towards data management, especially in terms of storage (data is not consumed during processing, i.e., the data volume keeps growing), flexibility (new applications emerge), and operability (analysts are no IT experts). The goal has to be a demand-driven data provisioning, i.e., the right data must be available in the right form at the right time. Therefore, we introduce a tailorable data preparation zone for Data Lakes called BARENTS. It enables users to model in an ontology how to derive information from data and assign the information to use cases. The data is automatically processed based on this model and the refined data is made available to the appropriate use cases. Here, we focus on a resource-efficient data management strategy. BARENTS can be embedded seamlessly into established Big Data infrastructures, e.g., Data Lakes.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-11&engl=0}
}
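
To illustrate the idea described in this abstract (deriving information from raw data according to a declarative model and provisioning it per use case), here is a minimal, hypothetical Python sketch; the model structure, names, and data are invented and are not the BARENTS implementation:

    # Toy, declarative "preparation model" in the spirit of the abstract.
    # Each rule states how to derive a piece of information from raw data
    # and which use case consumes it. All names here are made up.
    model = {
        "avg_temperature": {
            "source": "sensor_readings",
            "transform": lambda rows: sum(r["temp"] for r in rows) / len(rows),
            "use_cases": ["food_quality_analysis"],
        },
    }

    raw = {"sensor_readings": [{"temp": 3.9}, {"temp": 4.3}]}

    # "Demand-driven" provisioning: derive only what a use case asks for.
    def provide(use_case):
        return {
            name: rule["transform"](raw[rule["source"]])
            for name, rule in model.items()
            if use_case in rule["use_cases"]
        }

    print(provide("food_quality_analysis"))  # approx. {'avg_temperature': 4.1}
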
@inproceedings {INPROC-2021-10,
   author = {Alejandro Villanueva Zacarias and Christian Weber and Peter Reimann and Bernhard Mitschang},
   title = {{AssistML: A Concept to Recommend ML Solutions for Predictive Use Cases}},
   booktitle = {Proceedings of the 8th IEEE International Conference on Data Science and Advanced Analytics (DSAA 2021)},
   address = {Porto, Portugal},
   publisher = {IEEE},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   type = {Konferenz-Beitrag},
   month = {Oktober},
   year = {2021},
   keywords = {Recommender Systems; Machine Learning; Meta Learning},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {The adoption of machine learning (ML) in organizations is characterized by the use of multiple ML software components. Citizen data scientists face practical requirements when building ML systems, which go beyond the known challenges of ML, e.g., data engineering or parameter optimization. They are expected to quickly identify ML system options that strike a suitable trade-off across multiple performance criteria. These options also need to be understandable for non-technical users. Addressing these practical requirements represents a problem for citizen data scientists with limited ML experience. This calls for a method to help them identify suitable ML software combinations. Related work, e.g., AutoML systems, is not responsive enough or cannot balance different performance criteria. In this paper, we introduce AssistML, a novel concept to recommend ML solutions, i.e., software systems with ML models, for predictive use cases. AssistML uses metadata of existing ML solutions to quickly identify and explain options for a new use case. We implement the approach and evaluate it with two exemplary use cases. Results show that AssistML proposes ML solutions that are in line with users' performance preferences in seconds.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-10&engl=0}
}
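
The abstract describes ranking catalogued ML solutions by their recorded metadata against a user's performance preferences. A toy sketch of that general idea follows; the metadata fields, scoring rule, and weights are assumptions for illustration, not AssistML's actual method:

    # Hypothetical solution catalogue: metadata recorded for past ML solutions.
    solutions = [
        {"name": "rf_pipeline", "accuracy": 0.91, "train_minutes": 25},
        {"name": "logreg_fast", "accuracy": 0.86, "train_minutes": 2},
    ]

    def recommend(prefs, weight_accuracy=0.7):
        # Trade-off score: reward accuracy, penalise training time relative
        # to the user's budget. The formula is an invented stand-in.
        def score(s):
            time_penalty = s["train_minutes"] / prefs["max_train_minutes"]
            return weight_accuracy * s["accuracy"] - (1 - weight_accuracy) * time_penalty
        return sorted(solutions, key=score, reverse=True)

    # With a tight time budget, the fast but slightly less accurate model
    # ranks first, mirroring the multi-criteria balancing the paper targets.
    for s in recommend({"max_train_minutes": 10}):
        print(s["name"])
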
@inproceedings {INPROC-2021-09,
   author = {Eduard Wagner and Bernd Keller and Peter Reimann and Christoph Gr{\"o}ger and Dieter Spath},
   title = {{Advanced Analytics for Evaluating Critical Joining Technologies in Automotive Body Structures and Body Shops}},
   booktitle = {Proceedings of the 15th CIRP Conference on Intelligent Computation in Manufacturing Engineering (CIRP ICME)},
   publisher = {Elsevier},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   type = {Konferenz-Beitrag},
   month = {Juli},
   year = {2021},
   keywords = {Body Shop; Data Analytics; Data Mining; Advanced Analytics; Machine Learning},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {The product development process within the automotive industry is subject to changing demands due to internal and external influences. These influences and adjustments especially affect the car body and its inherent joining technology, as critical stages of variant creation. However, current literature does not offer a suitable analytical method to identify and assess these critical influences. We propose an advanced analytics approach that combines data mining and machine learning techniques within the car body substructure. The evaluation within the Mercedes-Benz AG shows that our approach facilitates a quantitative assessment of unknown interdependencies between car body modules and corresponding joining techniques.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-09&engl=0}
}
@inproceedings {INPROC-2021-08,
   author = {Alexander Birk and Yannick Wilhelm and Simon Dreher and Christian Flack and Peter Reimann and Christoph Gr{\"o}ger},
   title = {{A Real-World Application of Process Mining for Data-Driven Analysis of Multi-Level Interlinked Manufacturing Processes}},
   booktitle = {Procedia CIRP: Proceedings of the 54th CIRP Conference on Manufacturing Systems (CIRP CMS 2021)},
   publisher = {Elsevier},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   type = {Konferenz-Beitrag},
   month = {September},
   year = {2021},
   keywords = {Process Mining; Multi-level Interlinked Manufacturing Process; Heterogeneous Data Sources; Data Integration},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Process Mining (PM) has huge potential for manufacturing process analysis. However, there is little research on practical applications. We investigate a real-world manufacturing process of pneumatic valves. The manufacturing process comprises interlinked events at the superordinate business process level and at the subordinate machine level, making its analysis based on PM challenging. We show how to integrate heterogeneous data sources and give examples of how PM enables a deeper understanding of the manufacturing process, thereby helping to uncover optimization potentials. Furthermore, we discuss challenges in data integration and point out limitations of current PM techniques in manufacturing.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-08&engl=0}
}
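
A central step named in this abstract is integrating events from the business process level and the machine level into one event log. A minimal pandas sketch of such an integration step; column names and events are invented, and real pipelines involve far more cleaning:

    import pandas as pd

    # Business-level events (e.g., from an ERP/MES system).
    business = pd.DataFrame({
        "case_id":   ["valve-1", "valve-1"],
        "activity":  ["order released", "assembly finished"],
        "timestamp": pd.to_datetime(["2021-03-01 08:00", "2021-03-01 12:00"]),
    })
    # Machine-level events (e.g., from controller logs).
    machine = pd.DataFrame({
        "case_id":   ["valve-1"],
        "activity":  ["milling: spindle alarm"],
        "timestamp": pd.to_datetime(["2021-03-01 09:30"]),
    })

    # One interlinked, case-centric log ordered per case: the input format
    # process-mining tools generally expect.
    log = (pd.concat([business, machine])
             .sort_values(["case_id", "timestamp"])
             .reset_index(drop=True))
    print(log)
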
@inproceedings {INPROC-2021-07,
   author = {Julian Ziegler and Peter Reimann and Florian Keller and Bernhard Mitschang},
   title = {{A Metadata Model to Connect Isolated Data Silos and Activities of the CAE Domain}},
   booktitle = {Proceedings of the 33rd International Conference on Advanced Information Systems Engineering (CAiSE)},
   publisher = {Springer International Publishing},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {213--228},
   type = {Konferenz-Beitrag},
   month = {Juni},
   year = {2021},
   keywords = {Metadata Models; Graphs; Computer-aided Engineering},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Computer-aided engineering (CAE) applications support the digital transformation of the manufacturing industry. They facilitate virtual product development and product testing via computer simulations. CAE applications generate vast quantities of heterogeneous data. Domain experts struggle to access and analyze them, because such engineering data are not sufficiently described with metadata. In this paper, we characterize the CAE domain and identify unsolved challenges for a tailored data and metadata management. For instance, work activities in product development projects and their relationships to data are not represented explicitly in current metadata models. We propose a metadata model that addresses all challenges and provides a connected view on all CAE data, metadata, and work activities of development projects. We validate the feasibility of our metadata model through a prototypical implementation and its application to a real-world use case. This verifies that our metadata model addresses the CAE-specific challenges and this way eases the task of domain experts to exploit relevant data.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-07&engl=0}
}
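
The abstract proposes representing work activities, data, and their relationships explicitly. As a loose illustration of the graph-style traversal such a metadata model enables (not the paper's model; all node and relation names are invented):

    import networkx as nx

    # Metadata graph: activities, data artefacts, and projects as nodes,
    # typed relationships as edges.
    g = nx.DiGraph()
    g.add_edge("crash_simulation_run_42", "result_mesh.vtk", relation="produced")
    g.add_edge("crash_simulation_run_42", "input_geometry.stp", relation="used")
    g.add_edge("door_redesign_project", "crash_simulation_run_42", relation="contains")

    # Traverse from a project to all data its activities touched, the kind
    # of connected view the paper argues domain experts need.
    for activity in g.successors("door_redesign_project"):
        for artefact in g.successors(activity):
            print(activity, "->", artefact)
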
@inproceedings {INPROC-2021-06,
   author = {Rebecca Eichler and Corinna Giebler and Christoph Gr{\"o}ger and Eva Hoos and Holger Schwarz and Bernhard Mitschang},
   title = {{Enterprise-Wide Metadata Management - An Industry Case on the Current State and Challenges}},
   booktitle = {24th International Conference on Business Information Systems},
   editor = {Witold Abramowicz and S{\"o}ren Auer and El{\.z}bieta Lewa{\'n}ska},
   publisher = {TIB Open Publishing},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {269--279},
   type = {Konferenz-Beitrag},
   month = {Juli},
   year = {2021},
   doi = {10.52825/bis.v1i.47},
   language = {Englisch},
   cr-category = {A.0 General Literature, General},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Metadata management is a crucial success factor for companies today, as for example, it enables exploiting data value fully or enables legal compliance. With the emergence of new concepts, such as the data lake, and new objectives, such as the enterprise-wide sharing of data, metadata management has evolved and now poses a renewed challenge for companies. In this context, we interviewed a globally active manufacturer to reveal how metadata management is implemented in practice today, what challenges companies are faced with, and whether these constitute research gaps. As an outcome, we present the company's metadata management goals and their corresponding solution approaches and challenges. An evaluation of the challenges through a literature and tool review yields three research gaps, which are concerned with the topics: (1) metadata management for data lakes, (2) categorizations and compositions of metadata management tools for comprehensive metadata management, and (3) the use of data marketplaces as metadata-driven exchange platforms within an enterprise. The gaps lay the groundwork for further research activities in the field of metadata management and the industry case represents a starting point for research to realign with real-world industry needs.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-06&engl=0}
}
@inproceedings {INPROC-2021-05,
   author = {Corinna Giebler and Christoph Gr{\"o}ger and Eva Hoos and Rebecca Eichler and Holger Schwarz and Bernhard Mitschang},
   title = {{The Data Lake Architecture Framework}},
   booktitle = {Datenbanksysteme f{\"u}r Business, Technologie und Web (BTW 2021), 19. Fachtagung des GI-Fachbereichs Datenbanken und Informationssysteme (DBIS), 13.-17. September 2021, Dresden, Germany},
   publisher = {Gesellschaft f{\"u}r Informatik},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {351--370},
   type = {Konferenz-Beitrag},
   month = {September},
   year = {2021},
   doi = {10.18420/btw2021-19},
   language = {Englisch},
   cr-category = {H.4 Information Systems Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {During recent years, data lakes emerged as a way to manage large amounts of heterogeneous data for modern data analytics. Although various work on individual aspects of data lakes exists, there is no comprehensive data lake architecture yet. Concepts that describe themselves as a ``data lake architecture'' are only partial. In this work, we introduce the data lake architecture framework. It supports the definition of data lake architectures by defining nine architectural aspects, i.e., perspectives on a data lake, such as data storage or data modeling, and by exploring the interdependencies between these aspects. The included methodology helps to choose appropriate concepts to instantiate each aspect. To evaluate the framework, we use it to configure an exemplary data lake architecture for a real-world data lake implementation. This final assessment shows that our framework provides comprehensive guidance in the configuration of a data lake architecture.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-05&engl=0}
}
@inproceedings {INPROC-2021-04,
   author = {Manuel Fritz and Gang Shao and Holger Schwarz},
   title = {{Automatic Selection of Analytic Platforms with ASAP-DM}},
   booktitle = {Proceedings of the 33rd International Conference on Scientific and Statistical Database Management},
   publisher = {ACM},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {220--225},
   type = {Konferenz-Beitrag},
   month = {Juli},
   year = {2021},
   isbn = {9781450384131},
   doi = {10.1145/3468791.3468802},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {The plethora of available analytic platforms escalates the difficulty of selecting the most appropriate platform for a certain data mining task and datasets with varying characteristics. Especially novice analysts experience difficulties to keep up with the latest technical developments. In this demo, we present the ASAP-DM framework. ASAP-DM is able to automatically select a well-performing analytic platform for a given data mining task via an intuitive web interface, thus especially supporting novice analysts. The take-aways for demo attendees are: (1) a good understanding of the challenges of various data mining workloads, dataset characteristics, and the effects on the selection of analytic platforms, (2) useful insights on how ASAP-DM internally works, and (3) how to benefit from ASAP-DM for exploratory data analysis.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-04&engl=0}
}
@inproceedings {INPROC-2021-03,
   author = {Dennis Tschechlov and Manuel Fritz and Holger Schwarz},
   title = {{AutoML4Clust: Efficient AutoML for Clustering Analyses}},
   booktitle = {Proceedings of the 24th International Conference on Extending Database Technology (EDBT)},
   publisher = {Online},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {1--6},
   type = {Konferenz-Beitrag},
   month = {M{\"a}rz},
   year = {2021},
   doi = {10.5441/002/EDBT.2021.32},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Data analysis is a highly iterative process. In order to achieve valuable analysis results, analysts typically execute many configurations, i.e., algorithms and their hyperparameter settings, based on their domain knowledge. While experienced analysts may be able to define small search spaces for promising configurations, especially novice analysts define large search spaces due to their lack of domain knowledge. In the worst case, they perform an exhaustive search throughout the whole search space, resulting in infeasible runtimes. Recent advances in the research area of AutoML address this challenge by supporting novice analysts in the combined algorithm selection and hyperparameter optimization (CASH) problem for supervised learning tasks. However, no such systems exist for unsupervised learning tasks, such as the prevalent task of clustering analysis. In this work, we present our novel AutoML4Clust approach, which efficiently supports novice analysts regarding CASH for clustering analyses. To the best of our knowledge, this is the first thoroughly elaborated approach in this area. Our comprehensive evaluation unveils that AutoML4Clust significantly outperforms several existing approaches, as it achieves considerable speedups for the CASH problem, while still achieving very valuable clustering results.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-03&engl=0}
}
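
The CASH problem this abstract refers to means searching jointly over clustering algorithms and their hyperparameters. Below is a naive grid-search sketch of CASH for clustering using scikit-learn and an internal validation metric; the paper describes AutoML4Clust's optimizer as far more efficient than such an exhaustive loop:

    from sklearn.cluster import KMeans, AgglomerativeClustering
    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_score

    X, _ = make_blobs(n_samples=500, centers=4, random_state=0)

    # Combined search space: algorithms plus their hyperparameter settings.
    search_space = [(KMeans, {"n_clusters": k, "n_init": 10}) for k in range(2, 8)] \
                 + [(AgglomerativeClustering, {"n_clusters": k}) for k in range(2, 8)]

    # Evaluate every configuration and keep the best-scoring one.
    best = max(
        ((algo, params, silhouette_score(X, algo(**params).fit_predict(X)))
         for algo, params in search_space),
        key=lambda t: t[2],
    )
    print(best[0].__name__, best[1], round(best[2], 3))
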
@inproceedings {INPROC-2021-02,
   author = {Manuel Fritz and Dennis Tschechlov and Holger Schwarz},
   title = {{Efficient Exploratory Clustering Analyses with Qualitative Approximations}},
   booktitle = {Proceedings of the 24th International Conference on Extending Database Technology (EDBT)},
   publisher = {Online},
   institution = {Universit{\"a}t Stuttgart, Fakult{\"a}t Informatik, Elektrotechnik und Informationstechnik, Germany},
   pages = {1--6},
   type = {Konferenz-Beitrag},
   month = {M{\"a}rz},
   year = {2021},
   doi = {10.5441/002/EDBT.2021.31},
   language = {Englisch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Clustering is a fundamental primitive for exploratory data analyses. Yet, finding valuable clustering results for previously unseen datasets is a pivotal challenge. Analysts as well as automated exploration methods often perform an exploratory clustering analysis, i.e., they repeatedly execute a clustering algorithm with varying parameters until valuable results can be found. k-center clustering algorithms, such as k-Means, are commonly used in such exploratory processes. However, in the worst case, each single execution of k-Means requires a super-polynomial runtime, making the overall exploratory process on voluminous datasets infeasible in a reasonable time frame. We propose a novel and efficient approach for approximating results of k-center clustering algorithms, thus supporting analysts in an ad-hoc exploratory process for valuable clustering results. Our evaluation on an Apache Spark cluster unveils that our approach significantly outperforms the regular execution of a k-center clustering algorithm by several orders of magnitude in runtime with a predefinable qualitative demand. Hence, our approach is a strong fit for clustering voluminous datasets in exploratory settings.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INPROC-2021-02&engl=0}
}
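
As a rough illustration of trading a small, controllable quality loss for runtime (explicitly not the paper's approximation method): cluster a sample instead of the full dataset and compare the resulting error against a full run.

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=20000, centers=8, random_state=0)
    rng = np.random.default_rng(0)
    sample = X[rng.choice(len(X), size=2000, replace=False)]

    # Fit on the sample (fast), fit on everything (slow baseline).
    approx = KMeans(n_clusters=8, n_init=5, random_state=0).fit(sample)
    full   = KMeans(n_clusters=8, n_init=5, random_state=0).fit(X)

    # Evaluate both models on the full dataset via their sum of squared errors.
    def sse(model):
        labels = model.predict(X)
        return ((X - model.cluster_centers_[labels]) ** 2).sum()

    print("quality ratio:", sse(full) / sse(approx))  # close to 1.0 here
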
@article {ART-2021-05,
   author = {Manuel Fritz and Michael Behringer and Dennis Tschechlov and Holger Schwarz},
   title = {{Efficient exploratory clustering analyses in large-scale exploration processes}},
   journal = {The VLDB Journal},
   editor = {Georgia Koutrika and Ren{\'e}e J. Miller and Kyuseok Shim},
   address = {Berlin, Heidelberg},
   publisher = {Springer Berlin Heidelberg},
   pages = {1--22},
   type = {Artikel in Zeitschrift},
   month = {November},
   year = {2021},
   doi = {10.1007/s00778-021-00716-y},
   issn = {1066-8888},
   keywords = {Exploratory clustering analysis; Exploration; Clustering; Centroid-based clustering},
   language = {Englisch},
   cr-category = {H.3.3 Information Search and Retrieval},
   contact = {Senden Sie eine E-Mail an manuel.fritz@ipvs.uni-stuttgart.de.},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Clustering is a fundamental primitive in manifold applications. In order to achieve valuable results in exploratory clustering analyses, parameters of the clustering algorithm have to be set appropriately, which is a tremendous pitfall. We observe multiple challenges for large-scale exploration processes. On the one hand, they require specific methods to efficiently explore large parameter search spaces. On the other hand, they often exhibit large runtimes, in particular when large datasets are analyzed using clustering algorithms with super-polynomial runtimes, which repeatedly need to be executed within exploratory clustering analyses. We address these challenges as follows: First, we present LOG-Means and show that it provides estimates for the number of clusters in sublinear time regarding the defined search space, i.e., provably requiring less executions of a clustering algorithm than existing methods. Second, we demonstrate how to exploit fundamental characteristics of exploratory clustering analyses in order to significantly accelerate the (repetitive) execution of clustering algorithms on large datasets. Third, we show how these challenges can be tackled at the same time. To the best of our knowledge, this is the first work which simultaneously addresses the above-mentioned challenges. In our comprehensive evaluation, we unveil that our proposed methods significantly outperform state-of-the-art methods, thus especially supporting novice analysts for exploratory clustering analyses in large-scale exploration processes.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-05&engl=0}
}
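
The abstract's first contribution, LOG-Means, estimates the number of clusters with provably few executions of a clustering algorithm. The following is a heavily simplified stand-in, not the published algorithm: it evaluates k only on an exponentially spaced grid and looks for the largest relative drop in error:

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1000, centers=16, random_state=0)

    grid = [2, 4, 8, 16, 32, 64]  # log-spaced instead of exhaustive 2..64
    sse = {k: KMeans(n_clusters=k, n_init=5, random_state=0).fit(X).inertia_
           for k in grid}

    # The largest relative SSE drop hints at the region of the true k.
    ratios = {k2: sse[k1] / sse[k2] for k1, k2 in zip(grid, grid[1:])}
    print(max(ratios, key=ratios.get))  # likely 16 for this data
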
@article {ART-2021-04,
   author = {Dennis Przytarski and Christoph Stach and Cl{\'e}mentine Gritti and Bernhard Mitschang},
   title = {{Query Processing in Blockchain Systems: Current State and Future Challenges}},
   journal = {Future Internet},
   editor = {Dino Giuli and Andrew Hudson-Smith and Luis Javier Garcia Villalba},
   publisher = {MDPI},
   volume = {14},
   number = {1},
   pages = {1--31},
   type = {Artikel in Zeitschrift},
   month = {Dezember},
   year = {2021},
   issn = {1999-5903},
   doi = {10.3390/fi14010001},
   keywords = {blockchain systems; query processing; data models; data structures; block structures},
   language = {Englisch},
   cr-category = {H.3.0 Information Storage and Retrieval General,     H.3.3 Information Search and Retrieval},
   contact = {Senden Sie eine E-Mail an Dennis.Przytarski@ipvs.uni-stuttgart.de.},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {When, in 2008, Satoshi Nakamoto envisioned the first distributed database management system that relied on a cryptographically secured chain of blocks to store data in an immutable and tamper-resistant manner, his primary use case was the introduction of a digital currency. Owing to this use case, the blockchain system was geared towards efficient storage of data, whereas the processing of complex queries, such as provenance analyses of data history, is out of focus. The increasing use of Internet of Things technologies and the resulting digitization in many domains, however, have led to a plethora of novel use cases for a secure digital ledger. For instance, in the healthcare sector, blockchain systems are used for the secure storage and sharing of electronic health records, while the food industry applies such systems to enable a reliable food-chain traceability, e.g., to prove compliance with cold chains. In these application domains, however, querying the current state is not sufficient - comprehensive history queries are required instead. Due to these altered usage modes involving more complex query types, it is questionable whether today's blockchain systems are prepared for this type of usage and whether such queries can be processed efficiently by them. In our paper, we therefore investigate novel use cases for blockchain systems and elicit their requirements towards a data store in terms of query capabilities. We reflect the state of the art in terms of query support in blockchain systems and assess whether it is capable of meeting the requirements of such more sophisticated use cases. As a result, we identify future research challenges with regard to query processing in blockchain systems.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-04&engl=0}
}
@article {ART-2021-03,
   author = {Rebecca Eichler and Corinna Giebler and Christoph Gr{\"o}ger and Holger Schwarz and Bernhard Mitschang},
   title = {{Modeling metadata in data lakes—A generic model}},
   journal = {Data \& Knowledge Engineering},
   publisher = {Elsevier},
   volume = {136},
   pages = {1--17},
   type = {Artikel in Zeitschrift},
   month = {November},
   year = {2021},
   issn = {0169-023X},
   doi = {10.1016/j.datak.2021.101931},
   keywords = {Metadata management; Metadata model; Data lake; Data management; Data lake zones; Metadata classification},
   language = {Englisch},
   cr-category = {H.2 Database Management},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Data contains important knowledge and has the potential to provide new insights. Due to new technological developments such as the Internet of Things, data is generated in increasing volumes. In order to deal with these data volumes and extract the data's value new concepts such as the data lake were created. The data lake is a data management platform designed to handle data at scale for analytical purposes. To prevent a data lake from becoming inoperable and turning into a data swamp, metadata management is needed. To store and handle metadata, a generic metadata model is required that can reflect metadata of any potential metadata management use case, e.g., data versioning or data lineage. However, an evaluation of existent metadata models yields that none so far are sufficiently generic as their design basis is not suited. In this work, we use a different design approach to build HANDLE, a generic metadata model for data lakes. The new metadata model supports the acquisition of metadata on varying granular levels, any metadata categorization, including the acquisition of both metadata that belongs to a specific data element as well as metadata that applies to a broader range of data. HANDLE supports the flexible integration of metadata and can reflect the same metadata in various ways according to the intended utilization. Furthermore, it is created for data lakes and therefore also supports data lake characteristics like data lake zones. With these capabilities HANDLE enables comprehensive metadata management in data lakes. HANDLE's feasibility is shown through the application to an exemplary access-use-case and a prototypical implementation. By comparing HANDLE with existing models we demonstrate that it can provide the same information as the other models as well as adding further capabilities needed for metadata management in data lakes.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=ART-2021-03&engl=0}
}
@inbook {INBOOK-2021-01,
   author = {Dimitri Petrik and Mathias Mormul and Peter Reimann and Christoph Gr{\"o}ger},
   title = {{Anforderungen f{\"u}r Zeitreihendatenbanken im industriellen IoT}},
   series = {IoT – Best Practices},
   publisher = {Springer-Verlag},
   pages = {339--377},
   type = {Beitrag in Buch},
   month = {Mai},
   year = {2021},
   keywords = {Zeitreihendaten; Zeitreihendatenbanken; Industrial IoT; Edge Computing; Data Lake; InfluxDB},
   language = {Deutsch},
   cr-category = {H.2.8 Database Applications},
   department = {Universit{\"a}t Stuttgart, Institut f{\"u}r Parallele und Verteilte Systeme, Anwendersoftware},
   abstract = {Das industrielle Internet der Dinge (IIoT) integriert Informations- und Kommunikationstechnologien in industrielle Prozesse und erweitert sie durch Echtzeit-Datenanalyse. Hierbei sind sensorbasierte Zeitreihen ein wesentlicher Typ von Daten, die in der industriellen Fertigung generiert werden. Sensorbasierte Zeitreihendaten werden in regelm{\"a}{\ss}igen Abst{\"a}nden generiert und enthalten zus{\"a}tzlich zum Sensorwert einen Zeitstempel. Spezielle Zeitreihen-Datenbanken (eng.: Time Series Databases (TSDB)) sind daf{\"u}r ausgelegt, Zeitreihendaten effizient zu speichern. Wenn TSDBs maschinennah, d. h. in der industriellen Edge, eingesetzt werden, sind Maschinendaten zur {\"U}berwachung zeitkritischer Prozesse aufgrund der niedrigen Latenz schnell verf{\"u}gbar, was die erforderliche Zeit f{\"u}r die Datenverarbeitung reduziert. Andererseits k{\"o}nnen TSDBs auch in den Data Lakes als skalierbaren Datenplattformen zur Speicherung und Analyse von Rohdaten zum Einsatz kommen, um die langfristige Vorhaltung von Zeitreihendaten zu erm{\"o}glichen. Bisherige Untersuchungen zu TSDBs sind bei der Auswahl f{\"u}r den Einsatz in der industriellen Edge und im Data Lake nicht vorhanden. Die meisten verf{\"u}gbaren Benchmarks von TSDBs sind performanceorientiert und ber{\"u}cksichtigen nicht die Randbedingungen einer industriellen Edge oder eines Data Lake. Wir adressieren diese L{\"u}cke und identifizieren funktionale Kriterien f{\"u}r den Einsatz von TSDBs in diesen beiden Umgebungen und bilden somit einen qualitativen Kriterienkatalog. Des Weiteren zeigen wir am Beispiel von InfluxDB, wie dieser Katalog verwendet werden kann, mit dem Ziel die systematische Auswahl einer passenden TSDB f{\"u}r den Einsatz in der Edge und im Data Lake zu unterst{\"u}tzen.},
   url = {http://www2.informatik.uni-stuttgart.de/cgi-bin/NCSTRL/NCSTRL_view.pl?id=INBOOK-2021-01&engl=0}
}
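
The chapter discusses storing sensor-based time series in TSDBs such as InfluxDB. For orientation, here is a minimal write using the official influxdb-client package for InfluxDB 2.x; URL, token, org, bucket, and measurement names are placeholders and require a running server:

    from influxdb_client import InfluxDBClient, Point
    from influxdb_client.client.write_api import SYNCHRONOUS

    # Connection details are placeholders for a real InfluxDB 2.x instance.
    client = InfluxDBClient(url="http://localhost:8086", token="MY_TOKEN", org="my-org")
    write_api = client.write_api(write_options=SYNCHRONOUS)

    # One timestamped sensor reading: the tag identifies the machine, the
    # field holds the measured value.
    point = Point("spindle_temperature").tag("machine", "m-01").field("celsius", 63.2)
    write_api.write(bucket="edge-buffer", record=point)
    client.close()
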
 