Data Compression in Hadoop

1 Overview

Compression strategies and principles

2 MR supported compression coding

Compression format | Shipped with Hadoop? | Algorithm | File extension | Splittable? | Changes required after switching to this format
DEFLATE Yes, direct use DEFLATE .deflate no As with text processing, no modifications are required
Gzip Yes, direct use DEFLATE .gz no As with text processing, no modifications are required
bzip2 Yes, direct use bzip2 .bz2 yes As with text processing, no modifications are required
LZO No, installation is required LZO .lzo yes Index needs to be built and input format needs to be specified
Snappy No, installation is required Snappy .snappy no As with text processing, no modifications are required

To support a variety of compression/decompression algorithms, Hadoop provides a codec (encoder/decoder) class for each format, as shown in the following table.

Compressed format Corresponding coding / decoders
DEFLATE org.apache.hadoop.io.compress.DefaultCodec
gzip org.apache.hadoop.io.compress.GzipCodec
bzip2 org.apache.hadoop.io.compress.BZip2Codec
LZO com.hadoop.compression.lzo.LzopCodec
Snappy org.apache.hadoop.io.compress.SnappyCodec

Compression performance comparison

compression algorithm Original file size Compressed file size Compression speed Decompression speed
gzip 8.3GB 1.8GB 17.5MB/s 58MB/s
bzip2 8.3GB 1.1GB 2.4MB/s 9.5MB/s
LZO 8.3GB 2.9GB 49.3MB/s 74.6MB/s

3. Choice of Compression Mode

3.1 Gzip Compression

3.2 Bzip2 compression

3.3 Lzo compression

3.4 Snappy Compression

4 Compression Location Selection

5 Compression Parameter Configuration

parameter Default value stage
io.compression.codecs [core-site.xml] org.apache.hadoop.io.compress.DefaultCodec, org.apache.hadoop.io.compress.GzipCodec, org.apache.hadoop.io.compress.BZip2Codec Input compression
mapreduce.map.output.compress [mapred-site.xml] false mapper output
mapreduce.map.output.compress.codec [mapred-site.xml] org.apache.hadoop.io.compress.DefaultCodec mapper output
mapreduce.output.fileoutputformat.compress [mapred-site.xml] false reducer output
mapreduce.output.fileoutputformat.compress.codec [mapred-site.xml] org.apache.hadoop.io.compress.DefaultCodec reducer output
mapreduce.output.fileoutputformat.compress.type [mapred-site.xml] RECORD reducer output

6. Compression of Practical Cases

6.1 Data Stream Compression and Decompression

package com.djm.mapreduce.zip;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.*;

public class CompressUtils {
    /**
     * Demo entry point: compresses {@code args[0]} with the codec class named in
     * {@code args[1]} (a fully-qualified class name such as
     * {@code org.apache.hadoop.io.compress.BZip2Codec}), then decompresses the
     * file it just produced.
     *
     * <p>Bug fix vs. the original: the original called {@code decompress(args[0])}
     * on the *uncompressed* input, whose extension matches no codec, so the
     * decompression step always failed. We now decompress the compressed output.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        String compressedPath = compress(args[0], args[1]);
        decompress(compressedPath);
    }

    /**
     * Decompresses {@code path} into {@code path + ".decoded"}, selecting the
     * codec from the file extension via {@link CompressionCodecFactory}.
     * Prints a message and returns if no codec matches the extension.
     *
     * @param path path to a compressed file whose extension identifies its codec
     * @throws IOException if reading or writing fails
     */
    private static void decompress(String path) throws IOException {
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        // getCodec already returns CompressionCodec — the original's cast was redundant
        CompressionCodec codec = factory.getCodec(new Path(path));
        if (codec == null) {
            System.out.println("cannot find codec for file " + path);
            return;
        }
        // try-with-resources guarantees both streams are closed even if copyBytes throws
        try (CompressionInputStream cis = codec.createInputStream(new FileInputStream(path));
             FileOutputStream fos = new FileOutputStream(path + ".decoded")) {
            IOUtils.copyBytes(cis, fos, 1024);
        }
    }

    /**
     * Compresses {@code path} using the codec whose fully-qualified class name is
     * {@code method}; the output file is {@code path} plus the codec's default
     * extension (e.g. {@code .bz2}).
     *
     * @param path   path of the file to compress
     * @param method fully-qualified codec class name, instantiated via reflection
     * @return the path of the compressed output file
     * @throws IOException            if reading or writing fails
     * @throws ClassNotFoundException if {@code method} names no loadable class
     */
    private static String compress(String path, String method) throws IOException, ClassNotFoundException {
        Class<?> codecClass = Class.forName(method); // parameterized — avoid the raw Class type
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration());
        String outPath = path + codec.getDefaultExtension();
        // Resources are closed in reverse declaration order: cos, fos, fis.
        try (FileInputStream fis = new FileInputStream(path);
             FileOutputStream fos = new FileOutputStream(outPath);
             CompressionOutputStream cos = codec.createOutputStream(fos)) {
            IOUtils.copyBytes(fis, cos, 1024);
            cos.finish(); // flush any buffered compressed data before close
        }
        return outPath;
    }
}

6.2 Map output using compression

package com.djm.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WcDriver {
    /**
     * Word-count driver that bzip2-compresses the intermediate map output
     * (the data written during the shuffle phase).
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Enable compression of the map-side output and pick bzip2 as the codec.
        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);

        Job job = Job.getInstance(conf);
        job.setJarByClass(WcDriver.class);
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReduce.class);

        // Key/value types emitted by the mapper and by the job as a whole.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

6.3 Reduce output using compression

package com.djm.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WcDriver {
    /**
     * Word-count driver that bzip2-compresses the final (reduce-side) job output.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(WcDriver.class);
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReduce.class);

        // Key/value types emitted by the mapper and by the job as a whole.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Compress the reducer output, using bzip2 as the codec.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Tags: Big Data Hadoop Apache codec xml

Posted on Sun, 06 Oct 2019 22:47:40 -0400 by yepster123