uk.ac.ucl.panda.indexing.io.BasicDocMaker.java Source code

Introduction

Here is the source code for uk.ac.ucl.panda.indexing.io.BasicDocMaker.java
Source

package uk.ac.ucl.panda.indexing.io;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import uk.ac.ucl.panda.indexing.*;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

import uk.ac.ucl.panda.utility.io.Config;
import uk.ac.ucl.panda.utility.io.Format;
import uk.ac.ucl.panda.utility.io.NoMoreDataException;
import uk.ac.ucl.panda.utility.parser.HTMLParser;
import uk.ac.ucl.panda.utility.structure.DocData;
import uk.ac.ucl.panda.utility.structure.Document;
import uk.ac.ucl.panda.utility.structure.Field;

/**
 * Create documents for the test.
 * Maintains counters of chars etc. so that sub-classes just need to
 * provide textual content, and the create-by-size is handled here.
 *
 * <p/>
 * Config Params (default is in caps):
 * doc.stored=true|FALSE<br/>
 * doc.tokenized=TRUE|false<br/>
 * doc.term.vector=true|FALSE<br/>
 * doc.term.vector.positions=true|FALSE<br/>
 * doc.term.vector.offsets=true|FALSE<br/>
 * doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field<br/>
 * doc.maker.forever=TRUE|false //Exposed to sub-classes through the {@link #forever} field<br/>
 */
public abstract class BasicDocMaker implements DocMaker {

    public static final String BODY_FIELD = "body";
    public static final String TITLE_FIELD = "doctitle";
    public static final String DATE_FIELD = "docdate";
    public static final String ID_FIELD = "docid";
    public static final String BYTES_FIELD = "bytes";
    public static final String NAME_FIELD = "docname";

    /** Remainder of a doc-data whose body was only partially consumed by {@link #makeDocument(int)}. */
    private static class LeftOver {
        private DocData docdata;
        private int cnt;
    }

    // leftovers are thread local, because it is unsafe to share residues between threads
    private final ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();

    // Counters; all access goes through synchronized methods.
    private int numDocsCreated = 0;
    private long numBytes = 0;
    private long numUniqueBytes = 0;

    private boolean storeBytes = false;

    /** Value of doc.maker.forever; only stored here, acting on it is left to sub-classes. */
    protected boolean forever;

    protected Config config;

    protected Field.Store storeVal = Field.Store.NO;
    protected Field.Index indexVal = Field.Index.TOKENIZED;
    protected Field.TermVector termVecVal = Field.TermVector.NO;

    // Bookkeeping for printDocStatistics(): what was reported by the previous print.
    private int lastPrintedNumUniqueTexts = 0;
    private long lastPrintedNumUniqueBytes = 0;
    private int printNum = 0;

    private HTMLParser htmlParser;

    /** Returns the current document counter value (used as the doc id) and then increments it. */
    private synchronized int incrNumDocsCreated() {
        return numDocsCreated++;
    }

    /**
     * Return the data of the next document.
     * All current implementations can create docs forever.
     * When the input data is exhausted, input files are iterated.
     * This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
     * Callers in this class also accept a <code>null</code> return as "no more documents".
     * @return data of the next document.
     * @throws NoMoreDataException if data is exhausted (and 'forever' set to false).
     * @throws Exception if cannot create the next doc data
     */
    protected abstract DocData getNextDocData() throws NoMoreDataException, Exception;

    /**
     * Return the data of the document identified by <code>path</code>; how the path
     * is interpreted is up to the sub-class.
     * @param path identifies the document to load.
     * @return data of that document.
     * @throws UnsupportedEncodingException declared for sub-classes that decode text.
     * @throws Exception if the doc data cannot be created.
     */
    protected abstract DocData getDocData(String path) throws UnsupportedEncodingException, Exception;

    /**
     * Create a document holding the full content returned by {@link #getDocData(String)}.
     * @param path passed through to {@link #getDocData(String)}.
     * @return the new document, or <code>null</code> if no doc data was returned for the path.
     */
    public Document makeDocument(String path) throws UnsupportedEncodingException, Exception {
        DocData docData = getDocData(path);
        if (docData == null) {
            // same convention as makeDocument(): no data means no document, not an NPE
            return null;
        }
        return createDocument(docData, 0, -1);
    }

    /**
     * Create a document from the full content of the next doc data.
     * Any leftovers kept for the calling thread by {@link #makeDocument(int)} are discarded.
     * @return the new document, or <code>null</code> if {@link #getNextDocData()} returned null.
     */
    public Document makeDocument() throws Exception {
        resetLeftovers();
        DocData docData = getNextDocData();
        if (docData == null) {
            return null;
        }
        return createDocument(docData, 0, -1);
    }

    /**
     * Create a doc.
     * Uses only part of the body and modifies the doc data to keep the rest (or uses all
     * of it if <code>size &lt;= 0</code> or the body is not longer than <code>size</code>).
     * The docdata properties are reset so they are not added more than once.
     *
     * @param docData source of the fields; its body and props are modified.
     * @param size requested body length in chars, &lt;= 0 for "everything".
     * @param cnt when &gt;= 0 it is appended to the doc name as <code>name_cnt</code>.
     */
    private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
        int docid = incrNumDocsCreated();
        Document doc = new Document();
        doc.add(new Field(ID_FIELD, docid + "", storeVal, indexVal, termVecVal));
        if (docData.getName() != null) {
            String name = (cnt < 0 ? docData.getName() : docData.getName() + "_" + cnt);
            // the name is always stored and indexed as a single token, regardless of the config
            doc.add(new Field(NAME_FIELD, name, Field.Store.YES, Field.Index.UN_TOKENIZED, termVecVal));
        }
        if (docData.getDate() != null) {
            String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
            doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal));
        }
        if (docData.getTitle() != null) {
            doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal));
        }
        String body = docData.getBody();
        if (body != null && body.length() > 0) {
            String bdy;
            if (size <= 0 || size >= body.length()) {
                bdy = body; // use all
                docData.setBody(""); // nothing left
            } else {
                // attempt not to break words - if whitespace found within next 20 chars...
                // The scan never starts below index 1: cutting at index 0 would produce an
                // empty chunk and leave the body untouched, so no progress would be made.
                for (int n = Math.max(1, size - 1); n < size + 20 && n < body.length(); n++) {
                    if (Character.isWhitespace(body.charAt(n))) {
                        size = n;
                        break;
                    }
                }
                bdy = body.substring(0, size); // use part
                docData.setBody(body.substring(size)); // some left
            }
            // the body always gets a term vector, independent of the doc.term.vector* settings
            doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, Field.TermVector.YES));
            if (storeBytes) {
                doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES));
            }
        }

        if (docData.getProps() != null) {
            for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext();) {
                String key = (String) it.next();
                String val = (String) docData.getProps().get(key);
                doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
            }
            // cleared so that a later chunk of the same doc data does not repeat them
            docData.setProps(null);
        }
        //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
        return doc;
    }

    /**
     * Create a document whose body is about <code>size</code> chars long.
     * Text left over from the previous call on this thread is used first; while the
     * available text is shorter than <code>size</code>, further doc data is fetched and
     * its body appended. Whatever is not consumed is kept (per thread) for the next call.
     * <p/>
     * Doc data without a body contributes no text, so a positive <code>size</code> is only
     * reached if the source eventually supplies text, returns null or throws.
     *
     * @param size requested body length in chars.
     * @return the new document, built from whatever text was gathered if the input runs
     *         out early, or <code>null</code> if there is no doc data at all.
     */
    public Document makeDocument(int size) throws Exception {
        LeftOver lvr = leftovr.get();
        if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null
                || lvr.docdata.getBody().length() == 0) {
            resetLeftovers();
            lvr = null; // the discarded holder must not be consulted below
        }
        DocData dd;
        int cnt;
        if (lvr == null) {
            dd = getNextDocData();
            cnt = 0;
            if (dd == null) {
                return null; // same convention as makeDocument()
            }
        } else {
            dd = lvr.docdata;
            cnt = lvr.cnt;
        }
        while (dd.getBody() == null || dd.getBody().length() < size) {
            DocData next = getNextDocData();
            if (next == null) {
                break; // input exhausted: emit what has been gathered so far
            }
            next.setBody(concatBodies(dd.getBody(), next.getBody()));
            dd = next;
            cnt = 0;
        }
        Document doc = createDocument(dd, size, cnt);
        if (dd.getBody() == null || dd.getBody().length() == 0) {
            resetLeftovers();
        } else {
            if (lvr == null) {
                lvr = new LeftOver();
                leftovr.set(lvr);
            }
            lvr.docdata = dd;
            lvr.cnt = ++cnt;
        }
        return doc;
    }

    /** Joins two bodies, treating null as empty so the text "null" never ends up in a document. */
    private static String concatBodies(String head, String tail) {
        if (head == null) {
            return tail == null ? "" : tail;
        }
        return tail == null ? head : head + tail;
    }

    /** Drops the calling thread's leftovers. */
    private void resetLeftovers() {
        leftovr.set(null);
    }

    /**
     * Stores the config and derives the field store/index/term-vector settings from it,
     * as listed in the class comment. Position/offset settings take precedence over the
     * plain doc.term.vector flag.
     * @param config configuration to read from.
     */
    public void setConfig(Config config) throws IOException {
        this.config = config;
        boolean stored = config.get("doc.stored", false);
        boolean tokenized = config.get("doc.tokenized", true);
        boolean termVec = config.get("doc.term.vector", false);
        storeVal = (stored ? Field.Store.YES : Field.Store.NO);
        indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
        boolean termVecPositions = config.get("doc.term.vector.positions", false);
        boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
        if (termVecPositions && termVecOffsets) {
            termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
        } else if (termVecPositions) {
            termVecVal = Field.TermVector.WITH_POSITIONS;
        } else if (termVecOffsets) {
            termVecVal = Field.TermVector.WITH_OFFSETS;
        } else if (termVec) {
            termVecVal = Field.TermVector.YES;
        } else {
            termVecVal = Field.TermVector.NO;
        }
        storeBytes = config.get("doc.store.body.bytes", false);
        forever = config.get("doc.maker.forever", true);
    }

    /**
     * Prints the statistics, then zeroes the doc and byte counters and drops the calling
     * thread's leftovers. The unique-bytes counter is not reset.
     */
    public synchronized void resetInputs() {
        printDocStatistics();
        numBytes = 0;
        numDocsCreated = 0;
        resetLeftovers();
    }

    /** @return total of the sizes reported through {@link #addUniqueBytes(long)}. */
    public synchronized long numUniqueBytes() {
        return numUniqueBytes;
    }

    /** @return number of documents created since the last {@link #resetInputs()}. */
    public synchronized int getCount() {
        return numDocsCreated;
    }

    /** @return bytes reported through {@link #addBytes(long)} since the last {@link #resetInputs()}. */
    public synchronized long getByteCount() {
        return numBytes;
    }

    /** Adds <code>n</code> to the unique-bytes counter; guarded like the other counters. */
    protected synchronized void addUniqueBytes(long n) {
        numUniqueBytes += n;
    }

    /** Adds <code>n</code> to the byte counter. */
    protected synchronized void addBytes(long n) {
        numBytes += n;
    }

    /**
     * Prints a statistics report to standard output. A line is included only when its
     * value grew since the previous print (unique texts / unique bytes) or when documents
     * were added since the last inputs reset; if nothing qualifies, nothing is printed.
     */
    public void printDocStatistics() {
        boolean print = false;
        String col = "                  ";
        StringBuilder sb = new StringBuilder();
        String newline = System.getProperty("line.separator");
        sb.append("------------> ").append(Format.simpleName(getClass())).append(" statistics (").append(printNum)
                .append("): ").append(newline);
        int nut = numUniqueTexts();
        if (nut > lastPrintedNumUniqueTexts) {
            print = true;
            sb.append("total count of unique texts: ").append(Format.format(0, nut, col)).append(newline);
            lastPrintedNumUniqueTexts = nut;
        }
        long nub = numUniqueBytes();
        if (nub > lastPrintedNumUniqueBytes) {
            print = true;
            sb.append("total bytes of unique texts: ").append(Format.format(0, nub, col)).append(newline);
            lastPrintedNumUniqueBytes = nub;
        }
        if (getCount() > 0) {
            print = true;
            sb.append("num docs added since last inputs reset:   ").append(Format.format(0, getCount(), col))
                    .append(newline);
            sb.append("total bytes added since last inputs reset: ").append(Format.format(0, getByteCount(), col))
                    .append(newline);
        }
        if (print) {
            System.out.println(sb.append(newline).toString());
            printNum++;
        }
    }

    /**
     * Recursively collects input files below <code>path</code>.
     * Directories are descended with their entries visited in sorted path order. A file is
     * accepted when its lower-cased path ends with "z"; its path is appended to
     * <code>inputFiles</code> and its length added to the unique-bytes counter.
     * A path that does not exist is silently ignored.
     *
     * @param path file or directory, resolved against the default Hadoop file system.
     * @param inputFiles receives the accepted paths as strings.
     * @throws IOException if the file system cannot be accessed.
     */
    @SuppressWarnings("rawtypes")
    protected void collectFiles(String path, ArrayList inputFiles) throws IOException {
        // Resolve the file system once per call instead of once per visited path.
        FileSystem fs = FileSystem.get(new Configuration());
        collectFilesFrom(fs, path, inputFiles);
    }

    /** Recursive worker of {@link #collectFiles(String, ArrayList)} operating on an already resolved file system. */
    @SuppressWarnings({ "unchecked", "rawtypes" }) // inputFiles is raw in the inherited signature
    private void collectFilesFrom(FileSystem fs, String path, ArrayList inputFiles) throws IOException {
        Path p = new Path(path);
        //System.out.println("Collect: "+f.getAbsolutePath());
        if (!fs.exists(p)) {
            return;
        }
        if (fs.isDirectory(p)) {
            RemoteIterator<LocatedFileStatus> fileIter = fs.listLocatedStatus(p);
            List<String> files = new ArrayList<String>();
            while (fileIter.hasNext()) {
                files.add(fileIter.next().getPath().toString());
            }
            Collections.sort(files);
            for (String f : files) {
                collectFilesFrom(fs, f, inputFiles);
            }
            return;
        }
        //////////////ucl
        if (path.toLowerCase().endsWith("z")) {
            inputFiles.add(path);
            addUniqueBytes(fs.getFileStatus(p).getLen());
        }
    }

    /** @param htmlParser the HTML parser to keep for later retrieval through {@link #getHtmlParser()}. */
    public void setHTMLParser(HTMLParser htmlParser) {
        this.htmlParser = htmlParser;
    }

    /** @return the HTML parser set through {@link #setHTMLParser(HTMLParser)}, or null if none was set. */
    public HTMLParser getHtmlParser() {
        return htmlParser;
    }

}