% Conference paper from AICCSA 2021 proposing Small File Merger (SFM), an
% archive system for small files in HDFS.
% The citation key looks like an auto-generated export identifier; it is kept
% unchanged so existing \cite commands continue to resolve.
% NOTE(review): `address` holds a country, not the publisher's city -- confirm
% the correct location before relying on it in styles that print it.
% NOTE(review): `series` repeats the proceedings name rather than naming a
% numbered book series -- confirm whether it should be kept.
@inproceedings{51626737902f458b960608d319ba75b6,
  author    = {Chen, Dingchao and Wu, Chase Q. and Shen, Wei and Zhang, Yu},
  title     = {On a Small File Merger for Fast Access and Modifiability of Small Files in {HDFS}},
  booktitle = {2021 {IEEE/ACS} 18th International Conference on Computer Systems and Applications, {AICCSA} 2021 - Proceedings},
  series    = {Proceedings of {IEEE/ACS} International Conference on Computer Systems and Applications, {AICCSA}},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2021},
  doi       = {10.1109/AICCSA53542.2021.9686873},
  language  = {English (US)},
  keywords  = {Adaptive Readahead, Archive System, Big Data, HDFS, Small File Problems, Stochastic Approximation},
  abstract  = {Hadoop Distributed File System (HDFS) was originally designed to store big files and has been widely used in big-data ecosystem. However, it may suffer from serious performance issues when handling a large number of small files. In this paper, we propose a novel archive system, referred to as Small File Merger (SFM), to solve small file problems in HDFS. The key idea is to combine small files into large ones and build an index for accessing original files. Unlike traditional archive systems such as Hadoop Archives (Har), SFM allows modification of archived files directly without re-archiving. Considering that most of the reads in HDFS are sequential, we design an adaptive readahead strategy based on the Simultaneous Perturbation Stochastic Approximation (SPSA) algorithm to maximize read performance. Furthermore, our system provides an HDFS-compatible interface, which can be used directly without recompiling and redeploying the existing HDFS cluster, hence facilitating convenient deployment for practical use. Preliminary experimental results show that our system achieves better performance than existing methods.},
  note      = {Publisher Copyright: {\textcopyright} 2021 IEEE.; 18th IEEE/ACS International Conference on Computer Systems and Applications, AICCSA 2021 ; Conference date: 30-11-2021 Through 03-12-2021},
}