Java tutorial
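The listing below is the complete WikipediaIndexer class from the Taming Text question-answering (com.tamingtext.qa) example code. It feeds a Wikipedia XML dump through the Lucene benchmark module's EnwikiContentSource and posts the resulting articles to a Solr instance in batches using the SolrJ client; a main method adds a small command-line front end for choosing the dump file, the Solr URL, the document limit, and the batch size.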
/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.qa;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

import java.io.File;
import java.io.FilenameFilter;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.TimeZone;

/**
 * Takes the Wikipedia dump docs read by the Lucene benchmark content source
 * and indexes them into Solr.
 */
public class WikipediaIndexer {

  private static final Log log = LogFactory.getLog(WikipediaIndexer.class);
  private static final String LINE_SEP = System.getProperty("line.separator");

  public static final String DEFAULT_SOLR_URL = "http://localhost:8983/solr";

  private SolrServer server;

  public WikipediaIndexer() throws MalformedURLException {
    server = new CommonsHttpSolrServer(DEFAULT_SOLR_URL);
  }

  public WikipediaIndexer(SolrServer server) {
    this.server = server;
  }

  private static SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yyyy");
  private static SimpleDateFormat solrFormatter =
      new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", Locale.US);
  public static TimeZone UTC = TimeZone.getTimeZone("UTC");

  public int index(File wikipediaXML) throws Exception {
    return index(wikipediaXML, Integer.MAX_VALUE, 1000);
  }

  public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
      // EnwikiContentSource reads its input path from the "docs.file" property;
      // "doc.maker.forever" = false stops it from looping over the dump endlessly.
      EnwikiContentSource contentSource = new EnwikiContentSource();
      Properties properties = new Properties();
      String filePath = wikipediaXML.getAbsolutePath();
      properties.setProperty("docs.file", filePath);
      properties.setProperty("doc.maker.forever", "false");
      contentSource.setConfig(new Config(properties));
      contentSource.resetInputs();

      List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
      int i = 0;
      SolrInputDocument sDoc = null;
      long start = System.currentTimeMillis();
      try {
        DocData docData = new DocData();
        while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
          int mod = i % batchSize;
          sDoc = new SolrInputDocument();
          docs.add(sDoc);
          sDoc.addField("file", filePath + "_" + i);
          sDoc.addField("docid", String.valueOf(i));
          sDoc.addField("body", docData.getBody());
          sDoc.addField("doctitle", docData.getTitle());
          sDoc.addField("name_s", docData.getName());
          if (mod == batchSize - 1) {
            log.info("Sending: " + docs.size() + " docs, total sent for this file: " + i);
            server.add(docs);
            docs.clear();
          }
          i++;
        }
      } catch (NoMoreDataException e) {
        // Thrown when the content source runs out of articles; nothing to do.
      }
      long finish = System.currentTimeMillis();
      if (log.isInfoEnabled()) {
        log.info("Indexing took " + (finish - start) + " ms");
      }
      // Send any partial batch left over from the loop.
      if (docs.size() > 0) {
        server.add(docs);
      }
      result = i; // i already counts the docs in the final partial batch
      server.commit();
      server.optimize();
    } else {
      log.error("Can't find file: " + wikipediaXML);
    }
    return result;
  }

  public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option wikipediaFileOpt = obuilder.withLongName("wikiFile").withRequired(true)
        .withArgument(abuilder.withName("wikiFile").withMinimum(1).withMaximum(1).create())
        .withDescription(
            "The path to the Wikipedia dump file. May be a directory containing Wikipedia dump files."
                + " If a directory is specified, only .xml files are used.")
        .withShortName("w").create();
    Option numDocsOpt = obuilder.withLongName("numDocs").withRequired(false)
        .withArgument(abuilder.withName("numDocs").withMinimum(1).withMaximum(1).create())
        .withDescription("The number of docs to index").withShortName("n").create();
    Option solrURLOpt = obuilder.withLongName("solrURL").withRequired(false)
        .withArgument(abuilder.withName("solrURL").withMinimum(1).withMaximum(1).create())
        .withDescription("The URL where Solr lives").withShortName("s").create();
    Option solrBatchOpt = obuilder.withLongName("batch").withRequired(false)
        .withArgument(abuilder.withName("batch").withMinimum(1).withMaximum(1).create())
        .withDescription("The number of docs to include in each indexing batch").withShortName("b")
        .create();

    Group group = gbuilder.withName("Options").withOption(wikipediaFileOpt).withOption(numDocsOpt)
        .withOption(solrURLOpt).withOption(solrBatchOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    File file = new File(cmdLine.getValue(wikipediaFileOpt).toString());
    File[] dumpFiles;
    if (file.isDirectory()) {
      dumpFiles = file.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
          return name.endsWith(".xml");
        }
      });
    } else {
      dumpFiles = new File[] { file };
    }

    int numDocs = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numDocsOpt)) {
      numDocs = Integer.parseInt(cmdLine.getValue(numDocsOpt).toString());
    }
    String url = DEFAULT_SOLR_URL;
    if (cmdLine.hasOption(solrURLOpt)) {
      url = cmdLine.getValue(solrURLOpt).toString();
    }
    int batch = 100;
    if (cmdLine.hasOption(solrBatchOpt)) {
      batch = Integer.parseInt(cmdLine.getValue(solrBatchOpt).toString());
    }

    WikipediaIndexer indexer = new WikipediaIndexer(new CommonsHttpSolrServer(url));

    int total = 0;
    for (int i = 0; i < dumpFiles.length && total < numDocs; i++) {
      File dumpFile = dumpFiles[i];
      log.info("Indexing: " + dumpFile + ". Docs left to index: " + (numDocs - total));
      long start = System.currentTimeMillis();
      int totalFile = indexer.index(dumpFile, numDocs - total, batch);
      long finish = System.currentTimeMillis();
      if (log.isInfoEnabled()) {
        log.info("Indexing " + dumpFile + " took " + (finish - start) + " ms");
      }
      total += totalFile;
      log.info("Done Indexing: " + dumpFile + ". Indexed " + totalFile
          + " docs for that file and " + total + " overall.");
    }
    log.info("Indexed " + total + " docs overall.");
  }
}
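From the command line, main accepts --wikiFile/-w (required; a dump file or a directory of .xml dump files), --numDocs/-n (default: no limit), --solrURL/-s (default: http://localhost:8983/solr), and --batch/-b (default: 100). The indexer can also be driven directly from Java. The sketch below is a minimal, hypothetical example rather than part of the original listing: it assumes WikipediaIndexer and its SolrJ and Lucene benchmark dependencies are on the classpath, that a Solr instance is running at the default URL, and that the dump file name is a placeholder for a real download.

    import java.io.File;
    import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;

    public class WikipediaIndexerExample {
      public static void main(String[] args) throws Exception {
        // Placeholder URL and file name; substitute your own Solr core and dump file.
        WikipediaIndexer indexer =
            new WikipediaIndexer(new CommonsHttpSolrServer("http://localhost:8983/solr"));
        // Index at most 10,000 articles, sending them to Solr in batches of 500.
        int indexed = indexer.index(new File("enwiki-pages-articles.xml"), 10000, 500);
        System.out.println("Indexed " + indexed + " documents");
      }
    }

Whichever entry point you use, the target Solr schema must define the file, docid, body, and doctitle fields; name_s is typically covered by a *_s dynamic string field, which the stock Solr example schema provides. Note that index() commits and optimizes once per dump file, so batch size only controls how many documents travel in each add request.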