Java tutorial
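
The listing below is a complete DKPro BigData Hadoop driver. GoogleSyntacticsJobDkbd reads text records with Text2CASInputFormat, runs the JoBimRelationPipeline dependency-relation extraction as a UIMA analysis engine on the map side, ships the extractor configuration files to the workers via the distributed cache, and discards the job's own output (NullOutputFormat), relying instead on the pipeline's CAS consumer to write results.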
/*
 * Copyright 2012
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop;

import java.io.File;
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat.DocumentTextExtractor;

/**
 * @author Steffen Remus
 */
public class GoogleSyntacticsJobDkbd extends DkproHadoopDriver {

    static void print_usage(String message) {
        if (message != null && !"".equals(message))
            System.out.println(message);
        // Usage: HadoopPipe [hadoop-params] input output [num-mappers]
        System.out.format("Usage: ... %s -D%s=<extractor-configuration-file1>,<extractor-configuration-file2>,... input output [num-mappers] %n",
                GoogleSyntacticsJobDkbd.class.getName(),
                SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    }

    public static void main(String[] args) {
        try {
            ToolRunner.run(new Configuration(), new GoogleSyntacticsJobDkbd(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
        try {
            String extractorConfigurationFiles = conf.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
            String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
            // replace each configured extractor file with its localized copy from the distributed cache
            for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                for (Path p : DistributedCache.getLocalCacheFiles(conf))
                    if (p.getName().contains(extractorConfigurationFileName))
                        extractorConfigurationFilesArr[i] = p.toString();
            }

            int maxlength = conf.getInt(SHARED_CONSTANTS.PARAM_MAXIMUM_PATHLENGTH, -1);

            AggregateBuilder builder = new AggregateBuilder();
            // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
            builder.add(AnalysisEngineFactory.createEngineDescription(JoBimRelationPipeline.createGoogleSyntacticsRelationEngine(
                    true /* create_tokens */,
                    true /* create_sentences */,
                    true /* create_dependencies */,
                    true /* create_new_relations */,
                    true /* create_dependency_path */,
                    false /* ignore_nn_relations */,
                    maxlength /* dependency_path_maxlength */,
                    false /* create_detailed_output */,
                    extractorConfigurationFilesArr /* extractor_configuration */,
                    SHARED_CONSTANTS.HADOOP_CAS_CONSUMER_OUTPUT_FILENAME /* output_destination */)));
            return builder.createAggregateDescription();
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job) throws ResourceInitializationException {
        // no reduce-side UIMA processing
        return null;
    }

    @Override
    public Class<?> getInputFormatClass() {
        return Text2CASInputFormat.class;
    }

    @Override
    public void configure(JobConf job) {
        String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            // fall back to the default extractor configurations if none were given on the command line
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractor configuration file parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }

        try {
            // ship the extractor configuration files to the workers via the distributed cache
            String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
            for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
                DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job);
        } catch (IOException e) {
            e.printStackTrace();
        }

        Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class);
        job.setOutputFormat(NullOutputFormat.class); // ignore the serialized CAS and use only the output from the CasConsumer
    }

    public static class KeyPlusValueAsDocumentExtractor implements DocumentTextExtractor {

        private static Text _text = new Text();

        @Override
        public Text extractDocumentText(Text key, Text value) {
            // document text = "<key>\t<value>"
            _text.set(key.toString() + "\t" + value.toString());
            return _text;
        }
    }
}