Java tutorial: ingesting Wikipedia dumps into Apache Accumulo with MapReduce
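This tutorial walks through WikipediaIngester, the driver class from the Apache Accumulo wikisearch example. The tool creates the shard table along with its Index, ReverseIndex, and Metadata companion tables, attaches combiner iterators to each, and then runs a map-only MapReduce job that parses Wikipedia XML dumps and writes index Mutations directly to Accumulo through AccumuloOutputFormat. The complete source follows, with a short usage sketch at the end.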
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.examples.wikisearch.ingest;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.IteratorSetting.Column;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.admin.TableOperations;
import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
import org.apache.accumulo.core.iterators.user.SummingCombiner;
import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
import org.apache.accumulo.examples.wikisearch.iterator.TextIndexCombiner;
import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WikipediaIngester extends Configured implements Tool {

  public final static String INGEST_LANGUAGE = "wikipedia.ingest_language";
  public final static String SPLIT_FILE = "wikipedia.split_file";
  public final static String TABLE_NAME = "wikipedia.table";

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new WikipediaIngester(), args);
    System.exit(res);
  }

  public static void createTables(TableOperations tops, String tableName, boolean configureLocalityGroups)
      throws AccumuloException, AccumuloSecurityException, TableNotFoundException, TableExistsException {
    String indexTableName = tableName + "Index";
    String reverseIndexTableName = tableName + "ReverseIndex";
    String metadataTableName = tableName + "Metadata";

    // Create the shard table
    if (!tops.exists(tableName)) {
      // Set a text index combiner on the given field names.
      // No combiner is set if the option is not supplied.
      String textIndexFamilies = WikipediaMapper.TOKENS_FIELD_NAME;

      tops.create(tableName);
      if (textIndexFamilies.length() > 0) {
        System.out.println("Adding content combiner on the fields: " + textIndexFamilies);

        IteratorSetting setting = new IteratorSetting(10, TextIndexCombiner.class);
        List<Column> columns = new ArrayList<Column>();
        for (String family : StringUtils.split(textIndexFamilies, ',')) {
          columns.add(new Column("fi\0" + family));
        }
        TextIndexCombiner.setColumns(setting, columns);
        TextIndexCombiner.setLossyness(setting, true);

        tops.attachIterator(tableName, setting, EnumSet.allOf(IteratorScope.class));
      }

      // Set the locality group for the full content column family
      if (configureLocalityGroups)
        tops.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments",
            Collections.singleton(new Text(WikipediaMapper.DOCUMENT_COLUMN_FAMILY))));
    }

    if (!tops.exists(indexTableName)) {
      tops.create(indexTableName);
      // Add the UID combiner
      IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
      GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
      GlobalIndexUidCombiner.setLossyness(setting, true);
      tops.attachIterator(indexTableName, setting, EnumSet.allOf(IteratorScope.class));
    }

    if (!tops.exists(reverseIndexTableName)) {
      tops.create(reverseIndexTableName);
      // Add the UID combiner
      IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
      GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
      GlobalIndexUidCombiner.setLossyness(setting, true);
      tops.attachIterator(reverseIndexTableName, setting, EnumSet.allOf(IteratorScope.class));
    }

    if (!tops.exists(metadataTableName)) {
      // Add the SummingCombiner with VARLEN encoding for the frequency column
      tops.create(metadataTableName);
      IteratorSetting setting = new IteratorSetting(10, SummingCombiner.class);
      SummingCombiner.setColumns(setting, Collections.singletonList(new Column("f")));
      SummingCombiner.setEncodingType(setting, SummingCombiner.Type.VARLEN);
      tops.attachIterator(metadataTableName, setting, EnumSet.allOf(IteratorScope.class));
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), "Ingest Wikipedia");
    Configuration conf = job.getConfiguration();
    // Speculative map attempts would write duplicate mutations, so disable them.
    conf.set("mapred.map.tasks.speculative.execution", "false");

    String tablename = WikipediaConfiguration.getTableName(conf);
    String zookeepers = WikipediaConfiguration.getZookeepers(conf);
    String instanceName = WikipediaConfiguration.getInstanceName(conf);
    String user = WikipediaConfiguration.getUser(conf);
    byte[] password = WikipediaConfiguration.getPassword(conf);
    Connector connector = WikipediaConfiguration.getConnector(conf);

    TableOperations tops = connector.tableOperations();
    createTables(tops, tablename, true);

    configureJob(job);

    // Recursively gather the input files and the set of language prefixes.
    List<Path> inputPaths = new ArrayList<Path>();
    SortedSet<String> languages = new TreeSet<String>();
    FileSystem fs = FileSystem.get(conf);
    Path parent = new Path(conf.get("wikipedia.input"));
    listFiles(parent, fs, inputPaths, languages);

    System.out.println("Input files in " + parent + ": " + inputPaths.size());
    Path[] inputPathsArray = new Path[inputPaths.size()];
    inputPaths.toArray(inputPathsArray);

    System.out.println("Languages: " + languages.size());

    FileInputFormat.setInputPaths(job, inputPathsArray);

    // Map-only job: the mappers emit Mutations directly, so no reducers are needed.
    job.setMapperClass(WikipediaMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    AccumuloOutputFormat.setOutputInfo(job.getConfiguration(), user, password, true, tablename);
    AccumuloOutputFormat.setZooKeeperInstance(job.getConfiguration(), instanceName, zookeepers);

    return job.waitForCompletion(true) ? 0 : 1;
  }

  // Accepts MapReduce output files, which are named part-*.
  public final static PathFilter partFilter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return path.getName().startsWith("part");
    }
  };

  protected void configureJob(Job job) {
    Configuration conf = job.getConfiguration();
    job.setJarByClass(WikipediaIngester.class);
    job.setInputFormatClass(WikipediaInputFormat.class);
    // Aggregate the XML stream into one record per <page> element.
    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
  }

  // Language prefix, anything, then a literal ".xml" with an optional ".bz2" suffix.
  protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?");

  protected void listFiles(Path path, FileSystem fs, List<Path> files, Set<String> languages) throws IOException {
    for (FileStatus status : fs.listStatus(path)) {
      if (status.isDir()) {
        listFiles(status.getPath(), fs, files, languages);
      } else {
        Path p = status.getPath();
        Matcher matcher = filePattern.matcher(p.getName());
        if (matcher.matches()) {
          languages.add(matcher.group(1));
          files.add(p);
        }
      }
    }
  }
}
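To launch the ingester, the properties it reads must be present in the Hadoop configuration; main() above already wires the class into ToolRunner. The driver below is a minimal sketch, not part of the original source: of the keys it sets, "wikipedia.input" and TABLE_NAME ("wikipedia.table") appear in WikipediaIngester itself, while the Accumulo connection settings (instance name, zookeepers, user, password) are resolved through WikipediaConfiguration, whose property keys are defined in that class and not shown here. The input path is hypothetical.

import org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public class IngestDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // "wikipedia.input" is the directory scanned by listFiles(); this path is hypothetical.
    conf.set("wikipedia.input", "/wikipedia/dumps");
    // TABLE_NAME is the "wikipedia.table" constant declared in WikipediaIngester.
    conf.set(WikipediaIngester.TABLE_NAME, "wikipedia");
    // Accumulo connection settings (instance, zookeepers, user, password) must
    // also be set here, using the keys expected by WikipediaConfiguration.
    System.exit(ToolRunner.run(conf, new WikipediaIngester(), args));
  }
}

Because WikipediaIngester extends Configured and implements Tool, the same properties can equally be passed on the command line as -D key=value pairs, which ToolRunner's GenericOptionsParser applies to the configuration before run() is invoked.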