/*
 * Copyright 2012
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop.pipetests;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Iterator;

import org.apache.commons.io.LineIterator;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ComparisonChain;

import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

/**
 * MapReduce job that runs the JoBim relation extraction pipeline over plain text input and
 * sums the counts of the extracted (jo, bim) pairs.
 *
 * @author Steffen Remus
 */
public class GoogleSyntacticsJob2 extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(GoogleSyntacticsJob2.class);

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new GoogleSyntacticsJob2(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob2.class);
        conf.setJobName(GoogleSyntacticsJob2.class.getSimpleName());

        conf.setMapperClass(GoogleSyntacticsJob2Mapper.class);
        conf.setReducerClass(GoogleSyntacticsJob2Reducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // conf.setMapOutputKeyClass(Text.class);
        // conf.setMapOutputValueClass(NullWritable.class);

        conf.setOutputKeyClass(JoBimFormat.class);
        conf.setOutputValueClass(IntWritable.class);

        args = new GenericOptionsParser(conf, args).getRemainingArgs();
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // delete output path for testing purposes
        // FileSystem.get(conf).delete(new Path(args[1]), true);

        String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractor configuration file parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }

        // ship the extractor configuration files to the task JVMs via the distributed cache
        String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
        for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
            DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf);

        JobClient.runJob(conf);
        return 0;
    }

    public static class GoogleSyntacticsJob2Mapper extends MapReduceBase
            implements Mapper<LongWritable, Text, JoBimFormat, IntWritable> {

        File _tempfile;
        AnalysisEngine _engine;

        @Override
        public void configure(JobConf job) {
            try {
                // resolve the extractor configuration files against their local copies in the distributed cache
                String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
                String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
                for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                    String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                    for (Path p : DistributedCache.getLocalCacheFiles(job))
                        if (p.getName().contains(extractorConfigurationFileName))
                            extractorConfigurationFilesArr[i] = p.toString();
                }

                // the pipeline writes the extracted relations to this temporary file
                _tempfile = File.createTempFile("casconsumer", null);

                AggregateBuilder builder = new AggregateBuilder();
                // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
                builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline
                        .createGoogleSyntacticsRelationEngine(
                                true /* create_tokens */,
                                true /* create_sentences */,
                                true /* create_dependencies */,
                                true /* create_new_relations */,
                                true /* create_dependency_path */,
                                true /* ignore_nn_relations */,
                                5 /* dependecy_path_maxlength */,
                                false /* create_detailed_output */,
                                extractorConfigurationFilesArr /* extractor_configuration */,
                                _tempfile.getAbsolutePath() /* output_destination */)));
                _engine = builder.createAggregate();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        private JoBimFormat _jobim = new JoBimFormat();
        private IntWritable _count = new IntWritable();

        @Override
        public void map(LongWritable key, Text value, OutputCollector<JoBimFormat, IntWritable> output,
                Reporter reporter) throws IOException {
            String casinput = value.toString();

            // run pipeline, get results from tempfile
            JCas aJCas;
            try {
                aJCas = _engine.newJCas();
            } catch (ResourceInitializationException e) {
                LOG.error("Could not initialize cas", e);
                return;
            }
            aJCas.setDocumentText(casinput);
            DocumentMetaData meta = DocumentMetaData.create(aJCas);
            meta.setDocumentId(key.toString());
            try {
                _engine.process(aJCas);
            } catch (AnalysisEngineProcessException e) {
                LOG.error("Could not process cas", e);
                return;
            }

            // each line of the temp file holds a tab-separated triple: jo, bim, count
            LineIterator iter = new LineIterator(new FileReader(_tempfile));
            while (iter.hasNext()) {
                String line = iter.nextLine();
                String[] splits = line.split("\t");
                _jobim._jo = splits[0];
                _jobim._bim = splits[1];
                _count.set(Integer.parseInt(splits[2]));
                output.collect(_jobim, _count);
            }
            iter.close();

            // remove the temp file so results from this record are not read again for the next one
            try {
                _tempfile.delete();
            } catch (Exception e) {
                LOG.error("Could not delete tempfile.", e);
            }
        }
    }

    public static class GoogleSyntacticsJob2Reducer extends MapReduceBase
            implements Reducer<JoBimFormat, IntWritable, JoBimFormat, IntWritable> {

        private IntWritable _count = new IntWritable();

        @Override
        public void reduce(JoBimFormat key, Iterator<IntWritable> values,
                OutputCollector<JoBimFormat, IntWritable> output, Reporter reporter) throws IOException {
            // sum the counts emitted for each (jo, bim) key
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            _count.set(sum);
            output.collect(key, _count);
        }
    }

    /**
     * Writable key holding a (jo, bim) pair; serialized as two UTF strings and rendered
     * tab-separated by {@link #toString()}.
     */
    public static class JoBimFormat implements WritableComparable<JoBimFormat> {

        String _jo;
        String _bim;

        @Override
        public void readFields(DataInput in) throws IOException {
            _jo = in.readUTF();
            _bim = in.readUTF();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(_jo);
            out.writeUTF(_bim);
        }

        @Override
        public int compareTo(JoBimFormat o) {
            return ComparisonChain.start().compare(_jo, o._jo).compare(_bim, o._bim).result();
        }

        @Override
        public boolean equals(Object obj) {
            if (obj instanceof JoBimFormat)
                return compareTo((JoBimFormat) obj) == 0;
            return false;
        }

        @Override
        public int hashCode() {
            return _jo.hashCode() ^ _bim.hashCode();
        }

        @Override
        public String toString() {
            return String.format("%s\t%s", _jo, _bim);
        }
    }
}
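// Usage sketch: the jar name and paths below are illustrative, not taken from the project.
// The job is launched through ToolRunner; after Hadoop's generic options are parsed, the first
// remaining argument is taken as the input path and the second as the output path:
//
//   hadoop jar lt-n2n-jobs.jar de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob2 <input-path> <output-path>
//
// With TextOutputFormat, each output line is the key's toString() followed by a tab and the
// summed count, i.e. "<jo>\t<bim>\t<count>".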