Java tutorial
/*
 * Copyright 2016 Merck Sharp & Dohme Corp. a subsidiary of Merck & Co.,
 * Inc., Kenilworth, NJ, USA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.msd.gin.halyard.tools;

import com.msd.gin.halyard.common.HalyardTableUtils;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.StringTokenizer;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFHandlerException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.RDFParserFactory;
import org.eclipse.rdf4j.rio.RDFParserRegistry;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.AbstractRDFHandler;
import org.eclipse.rdf4j.rio.ntriples.NTriplesUtil;

/**
 * Apache Hadoop MapReduce Tool BulkLoading RDF into HBase
 * @author Adam Sotona (MSD)
 */
public class HalyardBulkLoad implements Tool {

    /**
     * Property defining number of bits used for HBase region pre-splits calculation for new table
     */
    public static final String SPLIT_BITS_PROPERTY = "halyard.table.splitbits";
    /**
     * Properties defining number of bits used for HBase context region pre-splits calculation for new table
     */
    public static final String CONTEXT_SPLIT_REGEXP = "halyard\\.table\\.context\\.splitbits\\.[0-9]+";

    /**
     * Boolean property skipping RDF parsing errors
     */
    public static final String SKIP_INVALID_PROPERTY = "halyard.parser.skipinvalid";

    /**
     * Boolean property enforcing triples and quads context override with the default context
     */
    public static final String OVERRIDE_CONTEXT_PROPERTY = "halyard.parser.context.override";

    /**
     * Property defining default context for triples (or even for quads when context override is set)
     */
    public static final String DEFAULT_CONTEXT_PROPERTY = "halyard.parser.context.default";

    private static final Logger LOG = Logger.getLogger(HalyardBulkLoad.class.getName());

    private Configuration conf;

    /**
     * Mapper class transforming each parsed Statement into set of HBase KeyValues
     */
    public static class RDFMapper extends Mapper<LongWritable, Statement, ImmutableBytesWritable, KeyValue> {

        private IRI defaultRdfContext;
        private boolean overrideRdfContext;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            overrideRdfContext = conf.getBoolean(OVERRIDE_CONTEXT_PROPERTY, false);
            String defCtx = conf.get(DEFAULT_CONTEXT_PROPERTY);
            defaultRdfContext = defCtx == null ? null : SimpleValueFactory.getInstance().createIRI(defCtx);
        }

        @Override
        protected void map(LongWritable key, Statement value, final Context context) throws IOException, InterruptedException {
            Resource rdfContext;
            if (overrideRdfContext || (rdfContext = value.getContext()) == null) {
                rdfContext = defaultRdfContext;
            }
            for (KeyValue keyValue : HalyardTableUtils.toKeyValues(value.getSubject(), value.getPredicate(), value.getObject(), rdfContext)) {
                context.write(new ImmutableBytesWritable(keyValue.getRowArray(), keyValue.getRowOffset(), keyValue.getRowLength()), keyValue);
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D" + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY + "=true] <input_path(s)> <output_path> <table_name>");
            return -1;
        }
        TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class, RDFFormat.class, RDFParser.class);
        HBaseConfiguration.addHbaseResources(getConf());
        if (SnappyCodec.isNativeCodeLoaded()) {
            getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
            getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
        }
        getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
        getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000L);
        getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
        getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
        getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
        getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
        Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
        job.setJarByClass(HalyardBulkLoad.class);
        job.setMapperClass(RDFMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        job.setInputFormatClass(RioFileInputFormat.class);
        job.setSpeculativeExecution(false);
        job.setReduceSpeculativeExecution(false);
        Map<String, Integer> contextSplitsMap = new HashMap<>();
        for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
            int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
            StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
            while (stk.hasMoreTokens()) {
                contextSplitsMap.put(stk.nextToken(), splits);
            }
        }
        try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true, getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
            HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
            FileInputFormat.setInputDirRecursive(job, true);
            FileInputFormat.setInputPaths(job, args[0]);
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            TableMapReduceUtil.addDependencyJars(job);
            TableMapReduceUtil.initCredentials(job);
            if (job.waitForCompletion(true)) {
                new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
                LOG.info("Bulk Load Completed..");
                return 0;
            }
        }
        return -1;
    }

    /**
     * MapReduce FileInputFormat reading and parsing any RDF4J RIO supported RDF format into Statements
     */
    public static final class RioFileInputFormat extends FileInputFormat<LongWritable, Statement> {

        /**
         * Default constructor of RioFileInputFormat
         */
        public RioFileInputFormat() {
            //this is a workaround to avoid autodetection of .xml files as TriX format and hook on .trix file extension only
            RDFParserRegistry reg = RDFParserRegistry.getInstance();
            Optional<RDFParserFactory> trixPF = reg.get(RDFFormat.TRIX);
            if (trixPF.isPresent()) {
                reg.remove(trixPF.get());
                final RDFParser trixParser = trixPF.get().getParser();
                reg.add(new RDFParserFactory() {
                    @Override
                    public RDFFormat getRDFFormat() {
                        RDFFormat t = RDFFormat.TRIX;
                        return new RDFFormat(t.getName(), t.getMIMETypes(), t.getCharset(), Arrays.asList("trix"), t.getStandardURI(), t.supportsNamespaces(), t.supportsContexts());
                    }
                    @Override
                    public RDFParser getParser() {
                        return trixParser;
                    }
                });
            }
        }

        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            return false;
        }

        @Override
        protected List<FileStatus> listStatus(JobContext job) throws IOException {
            List<FileStatus> filteredList = new ArrayList<>();
            for (FileStatus fs : super.listStatus(job)) {
                if (Rio.getParserFormatForFileName(fs.getPath().getName()) != null) {
                    filteredList.add(fs);
                }
            }
            return filteredList;
        }

        @Override
        public RecordReader<LongWritable, Statement> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
            return new RecordReader<LongWritable, Statement>() {

                private final AtomicLong key = new AtomicLong();

                private boolean skipInvalid = false;
                private ParserPump pump = null;
                private Statement current = null;
                private Thread pumpThread = null;

                @Override
                public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
                    context.setStatus("Parsing " + ((FileSplit) split).getPath().toString());
                    skipInvalid = context.getConfiguration().getBoolean(SKIP_INVALID_PROPERTY, false);
                    close();
                    pump = null;
                    try {
                        pump = new ParserPump((FileSplit) split, context.getConfiguration());
                        pumpThread = new Thread(pump);
                        pumpThread.setDaemon(true);
                        pumpThread.start();
                    } catch (IOException e) {
                        if (skipInvalid) {
                            LOG.log(Level.WARNING, "Exception while initialising RDF parser for " + ((FileSplit) split).getPath().toString(), e);
                        } else {
                            throw e;
                        }
                    }
                }

                @Override
                public boolean nextKeyValue() throws IOException, InterruptedException {
                    if (pump == null) return false;
                    current = null;
                    try {
                        current = pump.getNext();
                    } catch (IOException e) {
                        if (skipInvalid) {
                            LOG.log(Level.WARNING, "Exception while parsing RDF", e);
                        } else {
                            throw e;
                        }
                    }
                    key.incrementAndGet();
                    return current != null;
                }

                @Override
                public LongWritable getCurrentKey() throws IOException, InterruptedException {
                    return current == null ? null : new LongWritable(key.get());
                }

                @Override
                public Statement getCurrentValue() throws IOException, InterruptedException {
                    return current;
                }

                @Override
                public float getProgress() throws IOException, InterruptedException {
                    return pump == null ? 0 : pump.getProgress();
                }

                @Override
                public void close() throws IOException {
                    if (pump != null) {
                        pump.close();
                        pump = null;
                    }
                    if (pumpThread != null) {
                        pumpThread.interrupt();
                        pumpThread = null;
                    }
                }
            };
        }
    }

    private static final IRI NOP = SimpleValueFactory.getInstance().createIRI(":");
    private static final Statement END_STATEMENT = SimpleValueFactory.getInstance().createStatement(NOP, NOP, NOP);

    private static final class ParserPump extends AbstractRDFHandler implements Closeable, Runnable {
        private final String baseUri;
        private final Seekable seek;
        private final InputStream in;
        private final long size;
        private final SynchronousQueue<Statement> queue = new SynchronousQueue<>();
        private final boolean skipInvalid;
        private Exception ex = null;

        public ParserPump(FileSplit split, Configuration conf) throws IOException {
            this.size = split.getLength();
            Path file = split.getPath();
            this.baseUri = file.toString();
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream fileIn = fs.open(file);
            this.seek = fileIn;
            CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
            if (codec != null) {
                this.in = codec.createInputStream(fileIn, CodecPool.getDecompressor(codec));
            } else {
                this.in = fileIn;
            }
            this.skipInvalid = conf.getBoolean(SKIP_INVALID_PROPERTY, false);
        }

        public Statement getNext() throws IOException, InterruptedException {
            Statement s = queue.take();
            if (ex != null) {
                throw new IOException("Exception while parsing: " + baseUri, ex);
            }
            return s == END_STATEMENT ? null : s;
        }

        public float getProgress() throws IOException {
            return (float) seek.getPos() / (float) size;
        }

        @Override
        public void run() {
            try {
                RDFParser parser = Rio.createParser(Rio.getParserFormatForFileName(baseUri).get());
                parser.setRDFHandler(this);
                parser.setStopAtFirstError(!skipInvalid);
                parser.parse(in, baseUri);
            } catch (Exception e) {
                ex = e;
            } finally {
                try {
                    queue.put(END_STATEMENT);
                } catch (InterruptedException ignore) {
                }
            }
        }

        @Override
        public void handleStatement(Statement st) throws RDFHandlerException {
            try {
                queue.put(st);
            } catch (InterruptedException e) {
                throw new RDFHandlerException(e);
            }
        }

        @Override
        public void close() throws IOException {
            in.close();
        }
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(final Configuration c) {
        this.conf = c;
    }

    /**
     * Main of the HalyardBulkLoad
     * @param args String command line arguments
     * @throws Exception throws Exception in case of any problem
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new HalyardBulkLoad(), args));
    }
}
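The ParserPump above bridges RDF4J's push-style RDFHandler callbacks and the pull-style RecordReader: the parser thread hands each Statement across a SynchronousQueue and signals end-of-stream with a sentinel object (END_STATEMENT) that the consumer compares by identity. The following is a minimal, self-contained sketch of that handoff pattern only, using plain strings instead of Statements and invented names throughout; it is not part of HalyardBulkLoad.

import java.util.concurrent.SynchronousQueue;

public class HandoffSketch {

    // Sentinel object compared by identity, analogous to END_STATEMENT above.
    private static final String END = new String("END");

    public static void main(String[] args) throws InterruptedException {
        SynchronousQueue<String> queue = new SynchronousQueue<>();

        Thread producer = new Thread(() -> {
            try {
                for (String item : new String[]{"s1", "s2", "s3"}) {
                    queue.put(item); // blocks until the consumer takes the item
                }
            } catch (InterruptedException ignore) {
            } finally {
                try {
                    queue.put(END); // always signal end-of-stream, as ParserPump.run() does
                } catch (InterruptedException ignore) {
                }
            }
        });
        producer.setDaemon(true);
        producer.start();

        String s;
        while ((s = queue.take()) != END) { // identity check against the sentinel
            System.out.println("consumed " + s);
        }
    }
}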
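The tool is normally launched from the command line with the three positional arguments shown in the usage message, optionally tuned with the -D properties defined at the top of the class. As a rough sketch, the same run can also be driven programmatically through ToolRunner, mirroring what main() does; the input path, output path, and table name below are placeholder values, not defaults of the tool.

import com.msd.gin.halyard.tools.HalyardBulkLoad;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class BulkLoadLauncher {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Optional tuning via the properties defined above (values are illustrative only)
        conf.setBoolean(HalyardBulkLoad.SKIP_INVALID_PROPERTY, true);
        conf.setInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 8);
        conf.set(HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY, "http://new_context");

        // <input_path(s)> <output_path> <table_name> - placeholder arguments
        int exitCode = ToolRunner.run(conf, new HalyardBulkLoad(), new String[]{
                "hdfs:///data/rdf",
                "hdfs:///tmp/bulkload_workdir",
                "my_rdf_table"});
        System.exit(exitCode);
    }
}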