Java tutorial
/* * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ------------------- * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit * http://www.manning.com/ingersoll */ package com.tamingtext.tagging; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Collections; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; import org.apache.commons.cli2.OptionException; import org.apache.commons.cli2.builder.ArgumentBuilder; import org.apache.commons.cli2.builder.DefaultOptionBuilder; import org.apache.commons.cli2.builder.GroupBuilder; import org.apache.commons.cli2.commandline.Parser; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.mahout.common.CommandLineUtil; import org.apache.mahout.common.IOUtils; import org.apache.mahout.utils.vectors.lucene.Driver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class LuceneTagExtractor { private static final Logger log = LoggerFactory.getLogger(Driver.class); public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("dir").withRequired(true) .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create()) .withDescription("The Lucene directory").withShortName("d").create(); Option outputOpt = obuilder.withLongName("output").withRequired(false) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory").withShortName("o").create(); Option maxOpt = obuilder.withLongName("max").withRequired(false) .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create()) .withDescription( "The maximum number of vectors to output. If not specified, then it will loop over all docs") .withShortName("m").create(); Option fieldOpt = obuilder.withLongName("field").withRequired(true) .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create()) .withDescription("The field in the index").withShortName("f").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt) .withOption(fieldOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File file = new File(cmdLine.getValue(inputOpt).toString()); if (!file.isDirectory()) { throw new IllegalArgumentException(file + " does not exist or is not a directory"); } long maxDocs = Long.MAX_VALUE; if (cmdLine.hasOption(maxOpt)) { maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString()); } if (maxDocs < 0) { throw new IllegalArgumentException("maxDocs must be >= 0"); } String field = cmdLine.getValue(fieldOpt).toString(); PrintWriter out = null; if (cmdLine.hasOption(outputOpt)) { out = new PrintWriter(new FileWriter(cmdLine.getValue(outputOpt).toString())); } else { out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); } File output = new File("/home/drew/taming-text/delicious/training"); output.mkdirs(); emitTextForTags(file, output); IOUtils.close(Collections.singleton(out)); } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } } public static void dumpTags(File file, String field, long maxDocs) throws IOException { Directory dir = FSDirectory.open(file); IndexReader reader = IndexReader.open(dir, true); TermEnum te = reader.terms(new Term(field, "")); do { Term term = te.term(); if (term == null || term.field().equals(field) == false) { break; } System.err.printf("%s %d\n", term.text(), te.docFreq()); } while (te.next()); te.close(); } public static void emitTextForTags(File file, File output) throws IOException { String field = "tag"; Directory dir = FSDirectory.open(file); IndexReader reader = IndexReader.open(dir, true); TermEnum te = reader.terms(new Term(field, "")); StringBuilder buf = new StringBuilder(); do { Term term = te.term(); if (term == null || term.field().equals(field) == false) { break; } if (te.docFreq() > 30) { File f = new File(output, term.text() + ".txt"); PrintWriter pw = new PrintWriter(new FileWriter(f)); System.err.printf("%s %d\n", term.text(), te.docFreq()); TermDocs td = reader.termDocs(term); while (td.next()) { int doc = td.doc(); buf.setLength(0); appendVectorTerms(buf, reader.getTermFreqVector(doc, "description-clustering")); appendVectorTerms(buf, reader.getTermFreqVector(doc, "extended-clustering")); emitTagDoc(term, pw, buf); } pw.close(); } } while (te.next()); te.close(); } public static void emitTagDoc(Term term, PrintWriter pw, StringBuilder b) { if (b.length() < 100) { return; } pw.printf("%s\t%s\n", term.text(), b); } public static void appendVectorTerms(StringBuilder buf, TermFreqVector tv) { if (tv == null) return; String[] terms = tv.getTerms(); int[] frequencies = tv.getTermFrequencies(); for (int j = 0; j < terms.length; j++) { int freq = frequencies[j]; String term = terms[j]; for (int k = 0; k < freq; k++) { buf.append(term).append(' '); } } } public static void emitTermsForTags(PrintWriter out, StringBuilder buf, IndexReader reader, TermFreqVector tv) { if (tv == null) return; String[] terms = tv.getTerms(); for (int j = 0; j < terms.length; j++) { out.printf("%s\t%s\n", terms[j], buf.toString()); } } public static void dumpDocs(File indexDir, PrintWriter out, long maxDocs) throws IOException { Directory dir = FSDirectory.open(indexDir); IndexReader reader = IndexReader.open(dir, true); int max = reader.maxDoc(); StringBuilder buf = new StringBuilder(); for (int i = 0; i < max; i++) { if (!reader.isDeleted(i)) { buf.setLength(0); appendVectorTerms(buf, reader.getTermFreqVector(i, "description-clustering")); appendVectorTerms(buf, reader.getTermFreqVector(i, "extended-clustering")); emitTermsForTags(out, buf, reader, reader.getTermFreqVector(i, "tag")); } } } }