au.org.ala.bhl.command.ExtractNamesCommand.java Source code

Introduction

Here is the source code for au.org.ala.bhl.command.ExtractNamesCommand.java
Source

/*******************************************************************************
 * Copyright (C) 2011 Atlas of Living Australia
 * All Rights Reserved.
 *   
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *   
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package au.org.ala.bhl.command;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.List;

import org.apache.commons.cli.Options;
import org.apache.commons.lang.StringUtils;

import au.org.ala.bhl.Command;
import au.org.ala.bhl.IndexerOptions;
import au.org.ala.bhl.LanguageScore;
import au.org.ala.bhl.TaxonGrab;
import au.org.ala.bhl.WordLists;
import au.org.ala.bhl.service.CacheControlBlock;
import au.org.ala.bhl.service.CachedItemPageHandler;
import au.org.ala.bhl.service.DocumentCacheService;
import au.org.ala.bhl.service.ItemsService;

/**
 * Experimental command that attempts to identify taxonomic names in the OCR'd text held in the document cache.
 * <p>
 * Each page visited in the document cache is passed to {@link TaxonGrab#findNames}, and every name found is
 * written to the configured output file as one CSV line of the form <code>itemId,pageId,"name"</code>.
 * 
 * @author baird
 *
 */
@Command(name = "extract-names")
public class ExtractNamesCommand extends AbstractCommand {

    /**
     * A detected page language only overrides the item's metadata language when its score is strictly
     * greater than this value.
     */
    private static final double LANGUAGE_OVERRIDE_THRESHOLD = .75;

    /**
     * Walks every cached item page, extracts candidate names and writes them to the output file.
     * 
     * @param service
     *            the items service (not used by this command)
     * @param options
     *            supplies the document cache path and the output file location
     * @throws Exception
     *             if the output file cannot be opened or closed, or if the cache traversal fails
     */
    public void execute(final ItemsService service, final IndexerOptions options) throws Exception {

        final DocumentCacheService cache = new DocumentCacheService(options.getDocCachePath());
        final TaxonGrab nameGrabber = new TaxonGrab();
        final File outputFile = new File(options.getOutputFile());

        // Start from a clean file; results of a previous run are discarded.
        if (outputFile.exists()) {
            outputFile.delete();
        }

        final Writer writer = new FileWriter(outputFile);
        boolean completed = false;
        try {
            // NOTE(review): closing the writer right after this call assumes forEachItemPage invokes the
            // handler synchronously and has finished when it returns - confirm against DocumentCacheService.
            cache.forEachItemPage(new CachedItemPageHandler() {

                // Language taken from the current item's cache control block; "" when none is known.
                private String _language = "";

                public void onPage(String internetArchiveId, String pageId, String text) {

                    try {
                        LanguageScore score = WordLists.detectLanguage(text, _language);
                        String lang = _language;
                        // Trust the detected language over the metadata only when it differs and scores
                        // above the threshold.
                        if (score != null && !StringUtils.equalsIgnoreCase(score.getName(), _language)
                                && score.getScore() > LANGUAGE_OVERRIDE_THRESHOLD) {
                            log("Page %s - %s language detected as %s (scored %g) - This conflicts with meta data language of %s",
                                    internetArchiveId, pageId, score.getName(), score.getScore(), _language);
                            lang = score.getName();
                        }

                        List<String> names = nameGrabber.findNames(text, lang);
                        for (String name : names) {
                            String line = String.format("%s,%s,\"%s\"\n", internetArchiveId, pageId,
                                    escapeCsvQuotes(name));
                            writer.write(line);
                        }
                    } catch (IOException ioex) {
                        // The handler signature declares no checked exceptions, so wrap and propagate.
                        throw new RuntimeException(ioex);
                    }

                }

                public void startItem(String internetArchiveId) {
                    CacheControlBlock ccb = cache.getCacheControl(internetArchiveId);
                    if (ccb != null) {
                        _language = ccb.Language;
                    }
                    log("Starting item %s (%s)", internetArchiveId, _language);
                }

                public void endItem(String itemId) {
                    try {
                        // Flush per item so that completed items reach disk even if a later item fails.
                        writer.flush();
                    } catch (IOException ex) {
                        // Best effort: a failed flush should not abort the traversal.
                        log("Failed to flush output after item %s: %s", itemId, ex);
                    }
                    // Reset so the next item does not inherit this item's language.
                    _language = "";
                }

            });
            completed = true;
        } finally {
            if (completed) {
                // Surface close failures: buffered output may not have reached the file.
                writer.close();
            } else {
                // An exception is already propagating; do not let a close failure mask it.
                try {
                    writer.close();
                } catch (IOException closeEx) {
                    log("Failed to close output file %s: %s", outputFile, closeEx);
                }
            }
        }
    }

    /**
     * Registers the command line options understood by this command.
     * 
     * @param options
     *            the option set to add this command's options to
     */
    public void defineOptions(Options options) {
        options.addOption("o", true, "Output file for dumps, reports etc");
    }

    /**
     * Doubles any embedded double quotes so the value can be placed inside a double-quoted CSV field.
     * 
     * @param value
     *            the raw field value, may be null
     * @return the escaped value, or null if the input was null
     */
    private static String escapeCsvQuotes(String value) {
        return value == null ? null : value.replace("\"", "\"\"");
    }

}