crawl.SphinxWrapper.java Source code

Introduction

Here is the source code for crawl.SphinxWrapper.java
Source

/*
 *  SphinxWrapper.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *  
 */
package crawl;

import gate.Corpus;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.corpora.DocumentImpl;
import gate.creole.ResourceInstantiationException;
import gate.persist.PersistenceException;

import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Date;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import javax.mail.internet.ContentType;
import javax.mail.internet.ParseException;

import org.apache.commons.lang.StringUtils;

import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

public class SphinxWrapper extends Crawler {

    private static final long serialVersionUID = -6524027714398026402L;
    @SuppressWarnings("unused")
    private static final String __SVNID = "$Id: SphinxWrapper.java 17662 2014-03-14 16:19:05Z markagreenwood $";

    private Corpus corpus = null;
    private static int maxFetch = -1;
    private static int maxKeep = -1;
    private static AtomicInteger countFetched, countKept;
    private static boolean ignoreKeywords;
    private static boolean caseSensitiveKeywords;
    private static boolean convertXmlTypes;
    private static List<String> keywords;

    protected void setKeywords(List<String> newKeywords, boolean caseSensitive) {
        keywords = newKeywords;
        ignoreKeywords = (keywords == null) || keywords.isEmpty();
        caseSensitiveKeywords = caseSensitive;
    }

    protected void setConvertXmlTypes(boolean convert) {
        convertXmlTypes = convert;
    }

    @SuppressWarnings("unchecked")
    public void visit(Page p) {
        if (((maxFetch != -1) && (countFetched.get() >= maxFetch))
                || ((maxKeep != -1) && (countKept.get() >= maxKeep))) {
            syncIfNecessary();
            super.stop();
            return;
        }

        int currentFetched = countFetched.incrementAndGet();
        String urlString = p.toURL();
        int depth = p.getDepth();
        Document doc = makeDocument(p);
        p.discardContent();

        /* For the keyword-matching, we tried p.toText() but it doesn't
         * parse JavaScript as well as GATE's HTML parser.       */

        if (doc == null) {// failed to produce a valid gate.Document
            System.out
                    .println(countKept.toString() + " / " + currentFetched + " [" + depth + "] Drop: " + urlString);
        }

        else if (ignoreKeywords || containsAnyKeyword(doc, keywords, caseSensitiveKeywords)) {
            // produced a valid gate.Document
            // keyword match succeeded
            corpus.add(doc);
            int currentCount = countKept.incrementAndGet();

            if (corpus.getLRPersistenceId() != null) {
                corpus.unloadDocument(doc);
                Factory.deleteResource(doc);
            }
            System.out.println(currentCount + " / " + currentFetched + " [" + depth + "] Keep: " + urlString);
        }

        else { // keyword match failed
            System.out
                    .println(countKept.toString() + " / " + currentFetched + " [" + depth + "] Drop: " + urlString);
            Factory.deleteResource(doc);
        }
    }

    public boolean shouldVisit(Link l) {
        return super.shouldVisit(l);
    }

    protected void setDepth(int depth) {
        super.setMaxDepth(depth);
    }

    protected void setMaxPages(int max) {
        maxFetch = max;
    }

    protected void setMaxKeep(int max) {
        maxKeep = max;
    }

    protected int getMaxPages() {
        return maxFetch;
    }

    protected int getMaxKeep() {
        return maxKeep;
    }

    protected void addStartLink(String root) {
        try {
            URL url = new URL(root);
            Link link = new Link(url);
            System.out.println("Adding seed URL  " + url.toString());
            super.addRoot(link);
        } catch (MalformedURLException me) {
            System.err.println("Malformed url " + root);
            me.printStackTrace();
        }
    }

    protected void addStartLink(URL url) {
        Link link = new Link(url);
        System.out.println("Adding seed URL  " + url.toString());
        super.addRoot(link);
    }

    public void setCorpus(Corpus corpus) {
        this.corpus = corpus;
    }

    /* yes: application/rss+xml.xml
     * no:  image/svg+xml.xml
     */
    private static String convertMimeType(String originalType) {
        String result = originalType;
        if (originalType.endsWith("xml")
                && (originalType.startsWith("application") || originalType.startsWith("application"))) {
            result = "text/xml";
        }
        return result;
    }

    public void start() {
        super.run();
    }

    protected void resetCounter() {
        countFetched = new AtomicInteger(0);
        countKept = new AtomicInteger(0);
    }

    protected void interrupt() {
        super.stop();
        syncIfNecessary();
    }

    private void syncIfNecessary() {
        if (corpus.getLRPersistenceId() != null) {
            try {
                corpus.sync();
            } catch (PersistenceException e) {
                e.printStackTrace();
            } catch (SecurityException e) {
                e.printStackTrace();
            }
        }
    }

    private static boolean containsAnyKeyword(Document document, List<String> keywords, boolean caseSensitive) {
        return containsAnyKeyword(document.getContent().toString(), keywords, caseSensitive);
    }

    private static boolean containsAnyKeyword(String content, List<String> keywords, boolean caseSensitive) {
        if ((keywords == null) || keywords.isEmpty()) {
            return true;
        }

        // implied else: test the keywords
        if (caseSensitive) {
            for (String kw : keywords) {
                if (StringUtils.contains(content, kw)) {
                    return true;
                }
            }
        }

        else { // case-insensitive
            for (String kw : keywords) {
                if (StringUtils.containsIgnoreCase(content, kw)) {
                    return true;
                }
            }
        }

        return false;
    }

    private static Document makeDocument(Page page) {
        String url = page.toURL();
        FeatureMap params = Factory.newFeatureMap();

        Document doc = null;

        String docName = shortenUrl(url).replaceAll("[^\\p{ASCII}]", "_") + "_" + Gate.genSym();

        /* Take advantage of the MIME type from the server when
         * constructing the GATE document.      */
        String contentTypeStr = page.getContentType();
        String originalMimeType = null;

        if (contentTypeStr != null) {
            try {
                ContentType contentType = new ContentType(contentTypeStr);
                String mimeType = contentType.getBaseType();
                String encoding = contentType.getParameter("charset");

                // get the content as bytes, and convert it to string using the correct
                // encoding (thanks to Christian Wartena for patch)
                byte[] bContent = page.getContentBytes();
                String sContent = new String(bContent, Charset.forName(encoding));
                params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, sContent);

                if (mimeType != null) {
                    if (convertXmlTypes) {
                        originalMimeType = mimeType;
                        mimeType = convertMimeType(mimeType);
                        if (!originalMimeType.equals(mimeType)) {
                            System.out.println("   convert " + originalMimeType + " -> " + mimeType);
                        }
                    }
                    params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
                }

                if (encoding != null) {
                    params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);

                }
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }

        try {
            doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            FeatureMap docFeatures = doc.getFeatures();

            Integer originalLength = page.getLength();
            docFeatures.put("originalLength", originalLength);

            /* Use the Last-Modified HTTP header if available.  */
            long lastModified = page.getLastModified();
            Date date;
            if (lastModified > 0L) {
                date = new Date(lastModified);
            } else {
                date = new Date();
            }
            docFeatures.put("Date", date);

            if (originalMimeType != null) {
                docFeatures.put("originalMimeType", originalMimeType);
            }

            doc.setSourceUrl(page.getURL());
            docFeatures.put("gate.SourceURL", url);
        } catch (ResourceInstantiationException e) {
            System.err.println("WARNING: could not intantiate document " + docName);
            e.printStackTrace();
        }

        return doc;
    }

    private static String shortenUrl(String url) {
        String result = url.replaceAll("//+", "/");
        int s0 = StringUtils.lastIndexOf(url, '/');
        int s1 = StringUtils.lastIndexOf(url, '/', s0 - 1);
        if (s1 > 0) {
            result = url.substring(s1 + 1);
        }
        return result;
    }

}