com.liveramp.hank.hadoop.HadoopDomainCompactor.java Source code

Java tutorial

Introduction

Here is the source code for com.liveramp.hank.hadoop.HadoopDomainCompactor.java

Source

/**
 * Copyright 2011 LiveRamp
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.liveramp.hank.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.liveramp.cascading_ext.util.LocalityHelper;
import com.liveramp.hank.config.CoordinatorConfigurator;
import com.liveramp.hank.config.InvalidConfigurationException;
import com.liveramp.hank.config.yaml.YamlCoordinatorConfigurator;
import com.liveramp.hank.coordinator.Coordinator;
import com.liveramp.hank.coordinator.Domain;
import com.liveramp.hank.coordinator.DomainVersion;
import com.liveramp.hank.coordinator.RunWithCoordinator;
import com.liveramp.hank.coordinator.RunnableWithCoordinator;
import com.liveramp.hank.partition_server.DiskPartitionAssignment;
import com.liveramp.hank.storage.Compactor;
import com.liveramp.hank.storage.StorageEngine;
import com.liveramp.hank.storage.incremental.IncrementalDomainVersionProperties;
import com.liveramp.hank.storage.incremental.IncrementalStorageEngine;
import com.liveramp.hank.storage.incremental.IncrementalUpdatePlan;
import com.liveramp.hank.storage.incremental.IncrementalUpdatePlanner;
import com.liveramp.hank.util.CommandLineChecker;

public class HadoopDomainCompactor extends AbstractHadoopDomainBuilder {

    /** Class-wide SLF4J logger, used by the nested mapper and by {@code main}. */
    private static final Logger LOG = LoggerFactory.getLogger(HadoopDomainCompactor.class);

    /**
     * Creates a compactor through the no-argument constructor of the superclass.
     */
    public HadoopDomainCompactor() {
        // The superclass no-argument constructor is invoked implicitly.
    }

    /**
     * Creates a compactor on top of an existing job configuration.
     *
     * @param conf job configuration handed to the superclass constructor
     */
    public HadoopDomainCompactor(JobConf conf) {
        super(conf);
    }

    /**
     * Sets up the compaction job as a map-only job: splits come from
     * {@link HadoopDomainCompactorInputFormat}, records are processed by
     * {@link HadoopDomainCompactorMapper}, and no reducer is scheduled.
     *
     * @param conf job configuration to populate
     */
    @Override
    protected void configureJob(JobConf conf) {
        // Where the records come from and who processes them
        conf.setInputFormat(HadoopDomainCompactorInputFormat.class);
        conf.setMapperClass(HadoopDomainCompactorMapper.class);

        // Map-only job: mapper output is the job output
        conf.setNumReduceTasks(0);

        // Key types (intermediate and final are the same)
        conf.setMapOutputKeyClass(KeyAndPartitionWritable.class);
        conf.setOutputKeyClass(KeyAndPartitionWritable.class);

        // Value types (intermediate and final are the same)
        conf.setMapOutputValueClass(ValueWritable.class);
        conf.setOutputValueClass(ValueWritable.class);
    }

    /**
     * Mapper that compacts one partition of the configured domain version per input record.
     * <p>
     * The input key is the domain name and the input value is the partition number. During
     * {@link #configure(JobConf)} the mapper creates a unique local scratch directory, opens a
     * {@link Coordinator} and resolves the storage engine and the version to compact. During
     * {@link #close()} the scratch directory is deleted and the coordinator is closed.
     */
    private static class HadoopDomainCompactorMapper
            implements Mapper<Text, IntWritable, KeyAndPartitionWritable, ValueWritable> {

        private DomainVersion domainVersionToCompact;
        private StorageEngine storageEngine;
        private File localTmpOutput;
        private Coordinator coordinator;

        /**
         * Prepares the scratch directory, the coordinator, the storage engine and the version to
         * compact from the job configuration.
         *
         * @param conf job configuration carrying the domain builder / compactor properties
         * @throws RuntimeException if the scratch directory cannot be determined or created, or
         *                          if loading the domain or its version fails with an I/O error
         */
        @Override
        public void configure(JobConf conf) {
            localTmpOutput = createLocalTmpOutput(conf);

            // Load configuration items
            final String domainName = DomainBuilderProperties.getDomainName(conf);
            final int versionNumberToCompact =
                    DomainCompactorProperties.getVersionNumberToCompact(domainName, conf);

            // Create Coordinator
            coordinator = RunWithCoordinator.createCoordinator(DomainBuilderProperties.getConfigurator(conf));

            // Determine storage engine and version to compact
            try {
                Domain domain = DomainBuilderProperties.getDomain(coordinator, domainName);
                storageEngine = domain.getStorageEngine();
                domainVersionToCompact = DomainBuilderProperties
                        .getDomainVersion(coordinator, domainName, versionNumberToCompact);
            } catch (IOException e) {
                throw new RuntimeException("Failed to load configuration.", e);
            }
        }

        /**
         * Creates a fresh, uniquely named local directory for this task.
         *
         * @param conf job configuration used to resolve the local path
         * @return the newly created directory
         * @throws RuntimeException if the path cannot be resolved, already exists, or cannot be created
         */
        private static File createLocalTmpOutput(JobConf conf) {
            // A random UUID keeps concurrent tasks from colliding on the same directory name
            String uniqueString = UUID.randomUUID().toString();
            String localTmpOutputPath;
            try {
                localTmpOutputPath = conf.getLocalPath(uniqueString).toString();
            } catch (IOException e) {
                throw new RuntimeException("Failed to determine local temporary output directory", e);
            }
            File directory = new File(localTmpOutputPath);
            if (directory.exists() || !directory.mkdirs()) {
                throw new RuntimeException(
                        "Failed to initialize local temporary output directory " + localTmpOutputPath);
            }
            return directory;
        }

        /**
         * Compacts a single partition, writing the compacted records to the output collector.
         *
         * @param domainName      name of the domain being compacted (used for logging and error messages)
         * @param partitionNumber partition to compact
         * @param outputCollector destination of the compacted key/value pairs
         * @param reporter        task reporter handed to the output writer
         * @throws IOException      if compaction fails with an I/O error
         * @throws RuntimeException if the storage engine does not provide a compactor
         */
        @Override
        public void map(Text domainName, IntWritable partitionNumber,
                OutputCollector<KeyAndPartitionWritable, ValueWritable> outputCollector, Reporter reporter)
                throws IOException {
            LOG.info("Compacting Domain " + domainName.toString() + " Version "
                    + domainVersionToCompact.getVersionNumber() + " Partition " + partitionNumber.get() + " in "
                    + localTmpOutput.getAbsolutePath());

            // Assign this partition to the task's local scratch directory
            DiskPartitionAssignment assignment = new DiskPartitionAssignment(
                    Collections.singletonMap(partitionNumber.get(), localTmpOutput.getAbsolutePath()));

            // Get compacting updater
            Compactor compactor = storageEngine.getCompactor(assignment, partitionNumber.get());
            if (compactor == null) {
                throw new RuntimeException("Failed to load compacting updater for domain " + domainName
                        + " with storage engine: " + storageEngine);
            }
            // Close coordinator when possible
            compactor.closeCoordinatorOpportunistically(coordinator);
            // Perform compaction
            compactor.compact(domainVersionToCompact,
                    new OutputCollectorWriter(reporter, partitionNumber, outputCollector));
        }

        /**
         * Deletes the local scratch directory and closes the coordinator.
         * <p>
         * The coordinator is closed in a {@code finally} block so that a failure to delete the
         * directory cannot leave the coordinator open.
         *
         * @throws IOException if deleting the directory or closing the coordinator fails
         */
        @Override
        public void close() throws IOException {
            try {
                // Nothing to delete if configure() never got as far as creating the directory
                if (localTmpOutput != null) {
                    LOG.info("Deleting local temporary directory " + localTmpOutput.getAbsolutePath());
                    FileUtils.deleteDirectory(localTmpOutput);
                }
            } finally {
                if (coordinator != null) {
                    coordinator.close();
                }
            }
        }
    }

    /**
     * Input split describing a single partition of a domain to compact.
     * <p>
     * The serialized form is the domain name, then the partition number as a variable-length
     * int, then the array of location strings.
     */
    private static class HadoopDomainCompactorInputSplit implements InputSplit {

        private String domainName;
        private int partitionNumber;
        private String[] locations;

        /**
         * Creates an empty split; all fields keep their default values until
         * {@link #readFields(DataInput)} populates them.
         */
        public HadoopDomainCompactorInputSplit() {
        }

        /**
         * Creates a fully populated split.
         *
         * @param domainName      name of the domain to compact
         * @param partitionNumber partition covered by this split
         * @param locations       location strings reported by {@link #getLocations()}
         */
        public HadoopDomainCompactorInputSplit(String domainName, int partitionNumber, String[] locations) {
            this.domainName = domainName;
            this.partitionNumber = partitionNumber;
            this.locations = locations;
        }

        /** @return name of the domain this split belongs to */
        public String getDomainName() {
            return this.domainName;
        }

        /** @return partition number covered by this split */
        public int getPartitionNumber() {
            return this.partitionNumber;
        }

        /** @return always {@code 1}; the split carries exactly one record */
        @Override
        public long getLength() throws IOException {
            return 1;
        }

        /** @return the location strings this split was created or deserialized with */
        @Override
        public String[] getLocations() throws IOException {
            return this.locations;
        }

        /** Restores the fields in the same order {@link #write(DataOutput)} emits them. */
        @Override
        public void readFields(DataInput dataInput) throws IOException {
            this.domainName = WritableUtils.readString(dataInput);
            this.partitionNumber = WritableUtils.readVInt(dataInput);
            this.locations = WritableUtils.readStringArray(dataInput);
        }

        /** Serializes domain name, partition number and locations, in that order. */
        @Override
        public void write(DataOutput dataOutput) throws IOException {
            WritableUtils.writeString(dataOutput, this.domainName);
            WritableUtils.writeVInt(dataOutput, this.partitionNumber);
            WritableUtils.writeStringArray(dataOutput, this.locations);
        }
    }

    /**
     * Input format producing one {@link HadoopDomainCompactorInputSplit} per partition of the
     * domain to compact.
     */
    private static class HadoopDomainCompactorInputFormat implements InputFormat<Text, IntWritable> {

        // Populated from inside the coordinator callback in getSplits(); fields are used because
        // the anonymous callback cannot assign to local variables of the enclosing method.
        private Domain domain;
        private DomainVersion domainVersionToCompact;

        /**
         * Builds one split per partition of the configured domain.
         * <p>
         * For incremental storage engines, each split's locations are the hosts returned by
         * {@link LocalityHelper#getHostsSortedByLocality} for the partition's remote files;
         * for any other storage engine the locations are empty.
         *
         * @param conf    job configuration carrying the domain name and version to compact
         * @param ignored requested number of splits; not used
         * @return an array with exactly one split per domain partition
         * @throws IOException if the domain, its version or the update plan cannot be loaded
         */
        @Override
        public InputSplit[] getSplits(final JobConf conf, int ignored) throws IOException {
            final String domainName = DomainBuilderProperties.getDomainName(conf);
            RunWithCoordinator.run(DomainBuilderProperties.getConfigurator(conf), new RunnableWithCoordinator() {
                @Override
                public void run(Coordinator coordinator) throws IOException {
                    domain = DomainBuilderProperties.getDomain(coordinator, domainName);
                    domainVersionToCompact = domain
                            .getVersion(DomainCompactorProperties.getVersionNumberToCompact(domainName, conf));
                }
            });

            final int domainNumParts = domain.getNumParts();
            final StorageEngine storageEngine = domain.getStorageEngine();
            final InputSplit[] splits = new InputSplit[domainNumParts];

            // The planner and the update plan depend only on the domain and the version, not on
            // the partition, so compute them once instead of once per partition. Skipped when
            // there are no partitions so that no planner call is made in that case.
            IncrementalUpdatePlanner updatePlanner = null;
            IncrementalUpdatePlan updatePlan = null;
            if (domainNumParts > 0 && storageEngine instanceof IncrementalStorageEngine) {
                updatePlanner = ((IncrementalStorageEngine) storageEngine).getUpdatePlanner(domain);
                updatePlan = updatePlanner.computeUpdatePlan(domainVersionToCompact);
            }

            // Create splits
            for (int partition = 0; partition < domainNumParts; ++partition) {

                // Compute remote partition file paths for this split if possible
                String[] locations = new String[] {};
                if (updatePlanner != null) {
                    List<String> paths = updatePlanner.getRemotePartitionFilePaths(updatePlan, storageEngine
                            .getPartitionRemoteFileOps(StorageEngine.RemoteLocation.DOMAIN_BUILDER, partition));
                    locations = LocalityHelper.getHostsSortedByLocality(paths, conf);
                }

                splits[partition] = new HadoopDomainCompactorInputSplit(domainName, partition, locations);
            }
            return splits;
        }

        /**
         * Creates the single-record reader for a split produced by {@link #getSplits}.
         *
         * @param inputSplit split to read; must be a {@link HadoopDomainCompactorInputSplit}
         * @param conf       job configuration; not used
         * @param reporter   task reporter; not used
         * @return a reader emitting the split's domain name and partition number once
         */
        @Override
        public RecordReader<Text, IntWritable> getRecordReader(InputSplit inputSplit, JobConf conf,
                Reporter reporter) throws IOException {
            HadoopDomainCompactorInputSplit split = (HadoopDomainCompactorInputSplit) inputSplit;
            return new HadoopDomainCompactorRecordReader(split);
        }
    }

    /**
     * Record reader that emits exactly one record per split: the split's domain name as key
     * and its partition number as value.
     */
    private static class HadoopDomainCompactorRecordReader implements RecordReader<Text, IntWritable> {

        private final HadoopDomainCompactorInputSplit split;
        // Flips to true once the single record has been handed out
        private boolean recordEmitted = false;

        /**
         * @param split split whose domain name and partition number will be emitted
         */
        public HadoopDomainCompactorRecordReader(HadoopDomainCompactorInputSplit split) {
            this.split = split;
        }

        /** @return a fresh, empty key holder */
        @Override
        public Text createKey() {
            return new Text();
        }

        /** @return a fresh value holder */
        @Override
        public IntWritable createValue() {
            return new IntWritable();
        }

        /**
         * Fills the given holders with the split's content on the first call.
         *
         * @return {@code true} on the first call, {@code false} on every later call
         */
        @Override
        public boolean next(Text domainName, IntWritable partitionNumber) throws IOException {
            if (recordEmitted) {
                return false;
            }
            domainName.set(split.getDomainName());
            partitionNumber.set(split.getPartitionNumber());
            recordEmitted = true;
            return true;
        }

        /** @return always {@code 0}; no position is tracked */
        @Override
        public long getPos() throws IOException {
            return 0;
        }

        /** @return {@code 1} once the record has been emitted, {@code 0} before that */
        @Override
        public float getProgress() throws IOException {
            return recordEmitted ? 1.0f : 0.0f;
        }

        /** Nothing to release. */
        @Override
        public void close() throws IOException {
        }
    }

    /**
     * Command-line entry point: compacts one version of one domain.
     * <p>
     * Expected arguments, in order: domain name, number of the version to compact,
     * value for {@code mapred.task.timeout}, coordinator configuration path, job jar.
     *
     * @param args command-line arguments as described above
     * @throws IOException                   if building the compacted domain fails
     * @throws InvalidConfigurationException if the coordinator configuration is invalid
     * @throws NumberFormatException         if the version number or the timeout is not an integer
     */
    public static void main(String[] args) throws IOException, InvalidConfigurationException {
        final String[] expectedArguments = { "domain name", "version to compact number",
                "mapred.task.timeout", "config path", "jobjar" };
        CommandLineChecker.check(args, expectedArguments, HadoopDomainCompactor.class);

        // Positional arguments
        final String domainName = args[0];
        final int versionNumber = Integer.parseInt(args[1]);
        final int taskTimeout = Integer.parseInt(args[2]);
        final CoordinatorConfigurator configurator = new YamlCoordinatorConfigurator(args[3]);
        final String jobJar = args[4];

        final DomainCompactorProperties properties =
                new DomainCompactorProperties(domainName, versionNumber, configurator);

        // Job configuration
        final String jobName = HadoopDomainCompactor.class.getSimpleName() + " Domain " + domainName
                + ", Version " + versionNumber;
        final JobConf conf = new JobConf();
        conf.setJar(jobJar);
        conf.set("mapred.task.timeout", Integer.toString(taskTimeout));
        conf.setJobName(jobName);

        // Run the compaction
        final HadoopDomainCompactor compactor = new HadoopDomainCompactor(conf);
        LOG.info("Compacting Hank domain " + domainName + " version " + versionNumber
                + " with coordinator configuration " + configurator);
        final String versionDescription = "Version " + versionNumber + " compacted";
        compactor.buildHankDomain(properties, new IncrementalDomainVersionProperties.Base(versionDescription));
    }
}