Java tutorial
/* * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ------------------- * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit * http://www.manning.com/ingersoll */ package com.tamingtext.qa; import java.io.File; import java.io.FilenameFilter; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; import java.util.Properties; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; import org.apache.commons.cli2.OptionException; import org.apache.commons.cli2.builder.ArgumentBuilder; import org.apache.commons.cli2.builder.DefaultOptionBuilder; import org.apache.commons.cli2.builder.GroupBuilder; import org.apache.commons.cli2.commandline.Parser; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.mahout.common.CommandLineUtil; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; import org.apache.solr.common.SolrInputDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class WikipediaWexIndexer { private transient static Logger log = LoggerFactory.getLogger(WikipediaWexIndexer.class); private SolrServer server; public static final String DEFAULT_SOLR_URL = "http://localhost:8983/solr"; public WikipediaWexIndexer() throws MalformedURLException { this.server = new CommonsHttpSolrServer(DEFAULT_SOLR_URL); } public WikipediaWexIndexer(SolrServer server) throws MalformedURLException { this.server = server; } public int index(File wikipediaWEX) throws Exception { return index(wikipediaWEX, Integer.MAX_VALUE, 1000); } public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaWEX != null && wikipediaWEX.isFile()) { WexWikiContentSource contentSource = new WexWikiContentSource(); Properties properties = new Properties(); // fileName = config.get("docs.file", null); String filePath = wikipediaWEX.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs(); // docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(docData.getID())); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); String[] categories = docData.getProps().getProperty("category").split(";;"); for (String c : categories) { sDoc.addField("category", c); } if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaWEX); } return result; } public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option wikipediaFileOpt = obuilder.withLongName("wikiFile").withRequired(true) .withArgument(abuilder.withName("wikiFile").withMinimum(1).withMaximum(1).create()) .withDescription("The path to the wikipedia dump file. " + "May be a directory containing wikipedia dump files. " + "If a directory is specified, files starting with the prefix " + "freebase-segment- are used.") .withShortName("w").create(); Option numDocsOpt = obuilder.withLongName("numDocs").withRequired(false) .withArgument(abuilder.withName("numDocs").withMinimum(1).withMaximum(1).create()) .withDescription("The number of docs to index").withShortName("n").create(); Option solrURLOpt = obuilder.withLongName("solrURL").withRequired(false) .withArgument(abuilder.withName("solrURL").withMinimum(1).withMaximum(1).create()) .withDescription("The URL where Solr lives").withShortName("s").create(); Option solrBatchOpt = obuilder.withLongName("batch").withRequired(false) .withArgument(abuilder.withName("batch").withMinimum(1).withMaximum(1).create()) .withDescription("The number of docs to include in each indexing batch").withShortName("b") .create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); Group group = gbuilder.withName("Options").withOption(wikipediaFileOpt).withOption(numDocsOpt) .withOption(solrURLOpt).withOption(solrBatchOpt).withOption(helpOpt).create(); Parser parser = new Parser(); parser.setGroup(group); try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File file; file = new File(cmdLine.getValue(wikipediaFileOpt).toString()); File[] dumpFiles; if (file.isDirectory()) { dumpFiles = file.listFiles(new FilenameFilter() { public boolean accept(File file, String s) { return s.startsWith("freebase-segment-"); } }); } else { dumpFiles = new File[] { file }; } int numDocs = Integer.MAX_VALUE; if (cmdLine.hasOption(numDocsOpt)) { numDocs = Integer.parseInt(cmdLine.getValue(numDocsOpt).toString()); } String url = DEFAULT_SOLR_URL; if (cmdLine.hasOption(solrURLOpt)) { url = cmdLine.getValue(solrURLOpt).toString(); } int batch = 100; if (cmdLine.hasOption(solrBatchOpt)) { batch = Integer.parseInt(cmdLine.getValue(solrBatchOpt).toString()); } WikipediaWexIndexer indexer = new WikipediaWexIndexer(new CommonsHttpSolrServer(url)); int total = 0; for (int i = 0; i < dumpFiles.length && total < numDocs; i++) { File dumpFile = dumpFiles[i]; log.info("Indexing: " + file + " Num files to index: " + (numDocs - total)); long start = System.currentTimeMillis(); int totalFile = indexer.index(dumpFile, numDocs - total, batch); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing " + dumpFile + " took " + (finish - start) + " ms"); } total += totalFile; log.info("Done Indexing: " + file + ". Indexed " + totalFile + " docs for that file and " + total + " overall."); } log.info("Indexed " + total + " docs overall."); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); return; } } }