Java tutorial
/* * Copyright 2012 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.lt.n2n.hadoop; import java.io.DataInput; import java.io.DataOutput; import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import jobimtext.holing.extractor.JobimAnnotationExtractor; import jobimtext.holing.extractor.JobimExtractorConfiguration; import jobimtext.holing.type.JoBim; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.mapreduce.filecache.DistributedCache; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.ComparisonChain; import de.tudarmstadt.lt.n2n.annotators.JoBimPrinter; import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline; import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS; import de.tudarmstadt.lt.utilities.types.RepeatedSentence; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; /** * * @author Steffen Remus */ public class GoogleSyntacticsJob extends Configured implements Tool { private static Logger LOG = LoggerFactory.getLogger(GoogleSyntacticsJob.class); public static void main(String[] args) throws Exception { int res = ToolRunner.run(new GoogleSyntacticsJob(), args); System.exit(res); } @Override public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), GoogleSyntacticsJob.class); conf.setJobName(GoogleSyntacticsJob.class.getSimpleName()); conf.setMapperClass(GoogleSyntacticsJob3Mapper.class); conf.setReducerClass(GoogleSyntacticsJob3Reducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setOutputKeyClass(JoBimFormat.class); conf.setOutputValueClass(IntWritable.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); // delete output path for testing purposes // FileSystem.get(conf).delete(new Path(args[1]), true); String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); if (extractorConfigurationFiles == null) { extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ','); System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n", SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); conf.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles); } String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), conf); JobClient.runJob(conf); return 0; } public static class GoogleSyntacticsJob3Mapper extends MapReduceBase implements Mapper<LongWritable, Text, JoBimFormat, IntWritable> { AnalysisEngine _engine; JobimAnnotationExtractor[] _extractors; @Override public void configure(JobConf job) { try { String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS); String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(","); for (int i = 0; i < extractorConfigurationFilesArr.length; i++) { String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName(); for (Path p : DistributedCache.getLocalCacheFiles(job)) if (p.getName().contains(extractorConfigurationFileName)) extractorConfigurationFilesArr[i] = p.toString(); } int maxlength = job.getInt(SHARED_CONSTANTS.PARAM_MAXIMUM_PATHLENGTH, -1); AggregateBuilder builder = new AggregateBuilder(); // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class)); builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline .createGoogleSyntacticsRelationEngine(true/* create_tokens */, true/* create_sentences */, true/* create_dependencies */, true/* create_new_relations */, true/* create_dependency_path */, false/*ignore_nn_relations*/, maxlength/* dependecy_path_maxlength (-1 for dynamic pathlength based on arc type, e.g. biarc, triarc, quadarc) */, false/* create_detailed_output */, null/* extractor_configuration */, null/* output_destination */))); _engine = builder.createAggregate(); try { _extractors = new JobimAnnotationExtractor[extractorConfigurationFilesArr.length]; for (int i = 0; i < extractorConfigurationFilesArr.length; i++) _extractors[i] = JobimExtractorConfiguration .getExtractorFromXmlFile(new File(extractorConfigurationFilesArr[i]).getName()); } catch (Exception e) { throw new ResourceInitializationException(e); } } catch (Exception e) { throw new RuntimeException(e); } } private JoBimFormat _jobim = new JoBimFormat(); private IntWritable _count = new IntWritable(); @Override public void map(LongWritable key, Text value, OutputCollector<JoBimFormat, IntWritable> output, Reporter reporter) throws IOException { reporter.progress(); FileSplit fileSplit = (FileSplit) reporter.getInputSplit(); String filename = fileSplit.getPath().getName(); String id = filename + ":" + key; String casinput = value.toString(); // run pipeline, get results from tempfile JCas aJCas; try { aJCas = _engine.newJCas(); } catch (ResourceInitializationException e) { LOG.error("Could not initialize cas", e); return; } aJCas.setDocumentText(casinput); DocumentMetaData meta = DocumentMetaData.create(aJCas); meta.setDocumentId(id); meta.setCollectionId(filename); try { _engine.process(aJCas); } catch (AnalysisEngineProcessException e) { LOG.error("Could not process cas", e); return; } reporter.progress(); Collection<RepeatedSentence> covering_annotations = JCasUtil.select(aJCas, RepeatedSentence.class); for (RepeatedSentence covering_annotation : covering_annotations) { int repetitions = covering_annotation.getRepetitionCount(); for (JoBim jb : JoBimPrinter.getJoBims(covering_annotation, false)) { for (JobimAnnotationExtractor extractor : _extractors) { _jobim._jo = extractor.extractKey(jb); _jobim._bim = extractor.extractValues(jb); _count.set(repetitions + 1); output.collect(_jobim, _count); } } } } } public static class GoogleSyntacticsJob3Reducer extends MapReduceBase implements Reducer<JoBimFormat, IntWritable, JoBimFormat, IntWritable> { private IntWritable _count = new IntWritable(); @Override public void reduce(JoBimFormat key, Iterator<IntWritable> values, OutputCollector<JoBimFormat, IntWritable> output, Reporter reporter) throws IOException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } _count.set(sum); output.collect(key, _count); } } public static class JoBimFormat implements WritableComparable<JoBimFormat> { String _jo; String _bim; @Override public void readFields(DataInput in) throws IOException { _jo = in.readUTF(); _bim = in.readUTF(); } @Override public void write(DataOutput out) throws IOException { out.writeUTF(_jo); out.writeUTF(_bim); } @Override public int compareTo(JoBimFormat o) { return ComparisonChain.start().compare(_jo, o._jo).compare(_bim, o._bim).result(); } @Override public boolean equals(Object obj) { if (obj instanceof JoBimFormat) return compareTo((JoBimFormat) obj) == 0; return false; } @Override public int hashCode() { return _jo.hashCode() ^ _bim.hashCode(); } @Override public String toString() { return String.format("%s\t%s", _jo, _bim); } } }