org.jab.docsearch.spider.LinkFinder.java Source code

Introduction

Here is the source code for org.jab.docsearch.spider.LinkFinder.java
Source

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.jab.docsearch.spider;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.security.SecureRandom;
import java.util.ArrayList;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.jab.docsearch.DocSearch;
import org.jab.docsearch.DocSearcherIndex;
import org.jab.docsearch.FileEnvironment;
import org.jab.docsearch.utils.DateTimeUtils;
import org.jab.docsearch.utils.FileUtils;
import org.jab.docsearch.utils.I18n;
import org.jab.docsearch.utils.NetUtils;
import org.jab.docsearch.utils.Utils;

/**
 * Class LinkFinder
 *
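 * FIXME: update() throws a NullPointerException when ds is null, because it
 * dereferences ds.idx without a check. ds is null whenever the test-only
 * constructor is used.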
 *
 * @version $Id: LinkFinder.java 172 2012-09-14 15:24:32Z henschel $
 */
public final class LinkFinder {
    /**
     * Log4J logger, named after the runtime class.
     */
    private final Logger logger = Logger.getLogger(getClass().getName());
    /**
     * FileEnvironment; used by saveAllLinks() to resolve the link list files
     * of an index.
     */
    private final FileEnvironment fEnv = FileEnvironment.getInstance();
    /**
     * NetUtils; used to query content type, size and status of URLs.
     */
    private final NetUtils netUtils = new NetUtils();

    // value of the "user.name" system property
    private final String USER_NAME = System.getProperty("user.name");
    // reset by getAllLinks() and reported by saveAllLinks(); no increment is
    // visible in this part of the file
    private int numSkips; // currently not used
    // optional application instance; when null, status and progress reporting
    // is skipped (see setStatus() and showMessage())
    private DocSearch ds = null;
    // index writer handed to ds.idx.addDocToIndex()
    private IndexWriter iw;
    // counters reset at the start of update() and exposed through the getNum*() getters
    private int numDeletes;
    private int numChanges;
    private int numNew;
    private int numUnChanged;
    private int numMetaNoIdx;
    private int numFails;
    // upper limit compared against the Content-Length of a URL; replaced by
    // ds.getMaxFileSize() when an application instance is attached
    private long maxFileSizeToGet = 600000;
    // URL of the page where spidering starts
    private final String pageName;
    // folder for the temporary download files; replaced by ds.tempDir when an
    // application instance is attached
    private String downloadFileDir = System.getProperty("java.io.tmpdir");
    // index description; when set, saveAllLinks() derives both output files from its name
    private DocSearcherIndex dsi;
    // set from Utils.getBaseURLFolder(); links must start with it (see hasBaseUrl())
    private String baseUrlFolder = "";
    // set from Utils.getDomainURL(); links must start with it (see doPossibleAdd())
    private String domainUrl = "";
    // file that receives the links that are not dead (see saveAllLinks())
    private String outFile;
    // file that receives the dead links (see saveAllLinks())
    private String outBadFile;
    // all known links; addLink() inserts on the assumption that the list is
    // ordered by URL length, longest first
    private ArrayList<SpiderUrl> links = new ArrayList<SpiderUrl>();
    // URL suffixes that isHtml() treats as HTML pages
    private final String[] htmlTypes = { "/", ".htm", ".html", ".jhtml", ".shtm", ".shtml", ".asp", ".aspx", ".php",
            ".php3", ".php4", ".php5", ".jsp", ".cfm", ".cfml", ".do" };
    // URL suffixes that skipType() excludes from spidering
    private final String[] nonHtmlTypes = { ".zip", ".jpg", ".bmp", ".gif", ".db", ".cat", ".wmf", ".tif", ".tiff",
            ".swf", ".ncd", ".pdd", ".png", ".ppt", ".jpeg", ".mdb", ".msg", ".mpp", ".log" };
    // substrings that make hasBogusDirs() reject a URL
    private final String[] bogusDirs = { "_vti_", "_private", "file:" };
    // getAllLinks() stops once more than this many pages have been spidered
    private int maxLinksToFind = 5000;

    /**
     * Creates a link finder that is not attached to an application instance
     * or an index (only for test).
     *
     * @param pageName    URL of the page where spidering starts
     * @param outFile     file that receives the links that are not dead
     * @param outBadFile  file that receives the dead links
     */
    public LinkFinder(final String pageName, final String outFile, final String outBadFile) {
        this.outBadFile = outBadFile;
        this.outFile = outFile;
        this.pageName = pageName;

        // prepare HTTPS handling
        init();
    }

    /**
     * Creates a link finder that adds spidered documents to an index through
     * the given index writer.
     *
     * @param pageName        URL of the page where spidering starts
     * @param maxLinksToFind  limit for the number of spidered pages
     * @param ds              application instance, may be null
     * @param dsi             index description
     * @param iw              index writer used when documents are added
     */
    public LinkFinder(final String pageName, final int maxLinksToFind, final DocSearch ds,
            final DocSearcherIndex dsi, final IndexWriter iw) {
        this.iw = iw;
        this.dsi = dsi;
        this.ds = ds;
        this.maxLinksToFind = maxLinksToFind;
        this.pageName = pageName;

        // take over the settings of the application when one is attached
        if (this.ds != null) {
            this.maxFileSizeToGet = this.ds.getMaxFileSize();
            this.downloadFileDir = this.ds.tempDir;
        }

        // prepare HTTPS handling
        init();
    }

    /**
     * Creates a link finder that works on an already existing list of links.
     *
     * @param pageName        URL of the page where spidering starts
     * @param maxLinksToFind  limit for the number of spidered pages
     * @param ds              application instance, may be null
     * @param dsi             index description
     * @param links           list of links to work on; it is used directly, not copied
     */
    public LinkFinder(final String pageName, final int maxLinksToFind, final DocSearch ds,
            final DocSearcherIndex dsi, final ArrayList<SpiderUrl> links) {
        this.links = links;
        this.dsi = dsi;
        this.ds = ds;
        this.maxLinksToFind = maxLinksToFind;
        this.pageName = pageName;

        // take over the settings of the application when one is attached
        if (this.ds != null) {
            this.maxFileSizeToGet = this.ds.getMaxFileSize();
            this.downloadFileDir = this.ds.tempDir;
        }

        // prepare HTTPS handling
        init();
    }

    /**
     * Installs a default SSL socket factory for HttpsURLConnection that
     * accepts every certificate, so that HTTPS pages can be spidered without
     * certificate validation. A failure is logged and otherwise ignored.
     *
     * NOTE(review): HttpsURLConnection.setDefaultSSLSocketFactory() is a static
     * setting, so this switches off certificate validation for every
     * HttpsURLConnection that uses the default factory, not only for this
     * instance. Confirm that this is intended before reusing the class.
     */
    private void init() {
        // Create a trust manager that does not validate certificate chains
        TrustManager[] trustAllCerts = new TrustManager[] { new X509TrustManager() {
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                // the interface contract asks for a non-null (possibly empty)
                // array; returning null can break callers that iterate over it
                return new java.security.cert.X509Certificate[0];
            }

            @Override
            public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType) {
                // nothing, every client certificate is accepted
            }

            @Override
            public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType) {
                // nothing, every server certificate is accepted
            }
        } };

        // Install the all-trusting trust manager
        try {
            SSLContext sc = SSLContext.getInstance("SSL");
            sc.init(null, trustAllCerts, new SecureRandom());
            HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
        } catch (Exception e) {
            // best effort: spidering of plain HTTP pages still works without it
            logger.error("init() failed", e);
        }
    }

    /**
     * Decides whether a URL is handled as a HTML page. This is the case when
     * the URL contains a query string or ends (ignoring case) with one of the
     * suffixes in htmlTypes.
     *
     * @param url  url string
     * @return     true if the URL is handled as HTML
     */
    private boolean isHtml(final String url) {
        final String normalized = url.toLowerCase();

        // a URL with a query string is dynamic and counts as HTML
        // TODO dynamic url isn't always a html
        if (url.contains("?")) {
            return true;
        }

        // otherwise the suffix decides
        for (int i = 0; i < htmlTypes.length; i++) {
            if (normalized.endsWith(htmlTypes[i])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decides whether a URL is excluded from spidering because it ends
     * (ignoring case) with one of the suffixes in nonHtmlTypes.
     *
     * @param url  url string
     * @return     true if the URL has a suffix that is skipped
     */
    private boolean skipType(final String url) {
        final String normalized = url.toLowerCase();
        boolean skip = false;

        // stop at the first matching suffix
        for (int i = 0; i < nonHtmlTypes.length && !skip; i++) {
            skip = normalized.endsWith(nonHtmlTypes[i]);
        }

        return skip;
    }

    /**
     * Decides whether a URL contains (ignoring case) one of the fragments in
     * bogusDirs at any position.
     *
     * @param url  url string
     * @return     true if the URL contains a bogus fragment
     */
    private boolean hasBogusDirs(final String url) {
        final String normalized = url.toLowerCase();

        // the fragment may occur anywhere in the URL
        for (int i = 0; i < bogusDirs.length; i++) {
            if (normalized.contains(bogusDirs[i])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Finds the first link in the list that is neither spidered nor marked as
     * dead.
     *
     * @return  position of that link in the list or -1 if there is none
     */
    private int getNextUrlNo() {
        final int total = links.size();

        for (int pos = 0; pos < total; pos++) {
            final SpiderUrl candidate = links.get(pos);
            if (candidate.getIsSpidered() || candidate.getIsDeadLink()) {
                // already handled
                continue;
            }
            return pos;
        }

        return -1;
    }

    /**
     * Returns the URL string of the link at the given list position.
     *
     * @param linkNumber  position of the link in the list
     * @return            URL string of that link
     */
    private String getLinkNameByNo(final int linkNumber) {
        return links.get(linkNumber).getUrl();
    }

    /**
     * Returns the link entry at the given list position.
     *
     * @param linkNumber  position of the link in the list
     * @return            link entry at that position
     */
    private SpiderUrl getSpiderUrl(final int linkNumber) {
        return links.get(linkNumber);
    }

    /**
     * Spiders all pages reachable from the start page.
     *
     * The start page is added to the link list. Then, as long as the list
     * contains a link that is neither spidered nor dead, that link is
     * downloaded byte by byte into a temporary file. While a HTML page is
     * downloaded, the href values of a and area tags and the value of param
     * tags named "url" are passed to doPossibleAdd(). After the download the
     * MD5 sum is stored in the link entry, the file is added to the index
     * (only if an application instance is attached) and the temporary file is
     * deleted.
     *
     * A link whose processing throws an exception is marked as dead. The loop
     * ends when no link is left, when the application stops working or when
     * more than maxLinksToFind pages were spidered. Finally all links are
     * saved with saveAllLinks() and a summary is shown.
     */
    public void getAllLinks() {
        // writes links from a page out to a file
        String urlStr = pageName;
        String shortUrl = "";
        numUnChanged = 0;
        numSkips = 0;
        int numSuccesses = 0;
        int numFailed = 0;
        int numNoRobots = 0;
        addLink(urlStr);
        domainUrl = Utils.getDomainURL(urlStr);
        if (logger.isDebugEnabled()) {
            logger.debug("getAllLinks() domain url='" + domainUrl + "'");
        }
        baseUrlFolder = Utils.getBaseURLFolder(urlStr);
        int curLinkNo = 0;
        boolean completedSpider = false;
        boolean isDead = false;
        int curPread = 0;
        if (ds != null) {
            ds.setIsWorking(true);
            ds.setProgressMax(maxLinksToFind);
            ds.setCurProgressMSG("Spidering Files...");
        }
        int numSpidered = 0;
        int curSuccessNo = 0;

        // start spider
        while (curLinkNo != -1) {
            BufferedInputStream urlStream = null;
            FileOutputStream fileOutStream = null;
            // stays null until a link is selected in this iteration, so that the
            // finally block never overwrites the state of the previous link when
            // the loop is left through one of the early breaks
            SpiderUrl curl = null;

            try {
                completedSpider = false;
                isDead = false;
                if (ds != null) {
                    ds.setCurProgress(curPread);
                    if (!ds.getIsWorking()) {
                        break;
                    }
                }
                curLinkNo = getNextUrlNo();
                if (curLinkNo == -1) {
                    logger.debug("getAllLinks() end of links reached.");
                    break;
                } else {
                    urlStr = getLinkNameByNo(curLinkNo);
                    logger.info("getAllLinks() analyzing page='" + urlStr + "'");
                    curl = getSpiderUrl(curLinkNo);
                }

                shortUrl = Utils.concatEnd(urlStr, 33);
                setStatus(I18n.getString("connecting_to") + " " + shortUrl);

                // open url
                URL url = new URL(urlStr);
                URLConnection conn = url.openConnection();
                conn.setDoInput(true);
                conn.setUseCaches(false);
                conn.setRequestProperty("User-Agent", "DocSearcher " + I18n.getString("ds.version"));
                conn.connect();
                urlStream = new BufferedInputStream(conn.getInputStream());

                // filesize
                int fileSize = conn.getContentLength();
                if (fileSize > maxFileSizeToGet) {
                    String ex = I18n.getString("skipping_file_too_big") + " (" + fileSize + " > " + maxFileSizeToGet
                            + ") " + shortUrl;
                    setStatus(ex);
                    // handled by the catch block below, which marks the link as dead
                    throw new Exception(ex);
                }

                setStatus(I18n.getString("downloading_uc") + "... " + shortUrl + " " + fileSize + " "
                        + I18n.getString("bytes"));
                curl.setSize(fileSize);

                // last modified
                long curModified = conn.getLastModified(); // was .getDate();
                curl.setLastModified(curModified);

                // content type
                String curContentType = netUtils.getContentType(conn);
                curl.setContentType(curContentType);

                // build the value for downloadFile
                String dnldTmpName = getDownloadFileName(curl.getContentType(), urlStr.toLowerCase());
                String downloadFile = FileUtils.addFolder(downloadFileDir, dnldTmpName);

                // TODO it is better to use content type!
                boolean curIsWebPage = isHtml(urlStr.toLowerCase())
                        || (curContentType.toLowerCase().indexOf("html") != -1);

                logger.debug("getAllLinks() saving to " + downloadFile);
                fileOutStream = new FileOutputStream(downloadFile);
                int curSize = 0;
                int curI;
                int lastPercent = 0;
                StringBuilder tag = new StringBuilder();
                String link = null;
                boolean inTag = false;
                boolean getFileSizeFromStream = false;
                if (fileSize == -1) {
                    // no content length available, estimate it while reading
                    getFileSizeFromStream = true;
                }

                while ((curI = urlStream.read()) != -1) {
                    fileOutStream.write(curI);

                    curSize++;
                    if (ds != null) {
                        if (!ds.getIsWorking()) {
                            break;
                        }
                    }

                    // fix problem if filesize not in content length
                    if (getFileSizeFromStream) {
                        fileSize = curSize + urlStream.available();
                    }

                    // notify of download progress; the percentage is only computed
                    // for a positive size (a content length of 0 would divide by
                    // zero) and in long arithmetic to avoid an int overflow
                    if (fileSize > 0 && (curSize % 10) == 0) {
                        int curPercent = (int) ((curSize * 100L) / fileSize);
                        if (curPercent != lastPercent) {
                            lastPercent = curPercent;
                            setStatus(I18n.getString("downloading_uc") + "... : (" + shortUrl + ") --> "
                                    + curPercent + " %" + " ( " + (numSuccesses + numFailed + numNoRobots) + "/"
                                    + getNumLinksFound() + ")");
                        }
                    } // end for percent updates
                    else if (curSize % 40 == 0) {
                        // only reached when no usable size is known
                        setStatus(I18n.getString("downloading_uc") + "... : (" + shortUrl + ") --> " + curSize + " "
                                + I18n.getString("bytes"));
                    }

                    // handle links
                    if (curIsWebPage) {
                        char c = (char) curI;
                        // LOOK AT THE TAGS

                        // start tag
                        if (c == '<') {
                            inTag = true;
                            tag = new StringBuilder();
                        }
                        // end tag
                        else if (c == '>') {
                            inTag = false;
                            tag.append(c);
                            String realTag = tag.toString();
                            String lowerTag = realTag.toLowerCase();

                            // TODO fix problem with spaces before =

                            // link
                            if (lowerTag.startsWith("<a ")) {
                                link = Utils.getTagString("href=", realTag);
                                link = Utils.getNormalUrl(link);
                                doPossibleAdd(urlStr, link);
                            }
                            // area
                            else if (lowerTag.startsWith("<area")) {
                                link = Utils.getTagString("href=", realTag);
                                link = Utils.getNormalUrl(link);
                                doPossibleAdd(urlStr, link);
                            }
                            // TODO is in param realy a link?
                            else if (lowerTag.startsWith("<param")) {
                                String appletParam = Utils.getTagString("name=", realTag);
                                if (appletParam.toLowerCase().equals("url")) {
                                    link = Utils.getTagString("value=", realTag);
                                    link = Utils.getNormalUrl(link);
                                    doPossibleAdd(urlStr, link);
                                }
                            }
                        }

                        // in tag
                        if (inTag) {
                            tag.append(c);
                        }
                    }

                    // filesize ok
                    if (getFileSizeFromStream && fileSize > maxFileSizeToGet) {
                        break;
                    }
                } // end while downloading
                curPread++;
                // close before the MD5 sum is built so that all bytes are on disk
                fileOutStream.close();
                urlStream.close();
                curl.setMd5(FileUtils.getMD5Sum(downloadFile));

                // now add out document
                if (ds != null) {
                    curSuccessNo = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curl);
                    switch (curSuccessNo) {
                    case 0: // good
                        numSuccesses++;
                        break;
                    case 1: // bad
                        numFailed++;
                        break;
                    case 2: // meta robots - no index
                        numNoRobots++;
                        break;
                    default: // other results are not counted
                        break;
                    }
                }

                // delete temp file
                if (!FileUtils.deleteFile(downloadFile)) {
                    logger.warn("getAllLinks() can't delete file '" + downloadFile + "'");
                }

                numSpidered++;
                completedSpider = true;

                // max links found
                if (numSpidered > maxLinksToFind) {
                    break;
                }
            } catch (Exception e) {
                logger.fatal("getAllLinks() failed", e);
                setStatus(I18n.getString("error") + " : " + e.toString());
                isDead = true;
            } finally {
                // close resources
                IOUtils.closeQuietly(urlStream);
                IOUtils.closeQuietly(fileOutStream);

                // only record a result for a link that was selected in this iteration
                if (curl != null) {
                    curl.setSpidered(completedSpider);
                    curl.setIsDeadLink(isDead);
                    setStatus(I18n.getString("download_complete") + " " + shortUrl);
                }
            }
        } // end for iterating over links

        if (ds != null) {
            ds.resetProgress();
        }
        saveAllLinks();

        logger.info("getAllLinks() " + numSpidered + " total web pages spidered for links.");

        showMessage(I18n.getString("spidering_complete") + " (" + Utils.concatStrToEnd(pageName, 28) + ") ",
                numSpidered + " " + I18n.getString("documents_indexed") + " " + getNumLinksFound() + " "
                        + I18n.getString("links_found") + "\n\n" + numSuccesses + " "
                        + I18n.getString("documents_spidered_successful") + "\n\n" + numFailed + " "
                        + I18n.getString("documents_spidered_failed") + "\n\n" + numNoRobots + " "
                        + I18n.getString("documents_not_spidered"));
    }

    /**
     * Checks whether a URL lies below the base URL folder. The comparison
     * ignores case. Without a base URL folder every URL is accepted.
     *
     * @param toCheck  URL to check
     * @return         true if no base URL folder is set or the URL starts with it
     */
    private boolean hasBaseUrl(final String toCheck) {
        // nothing to compare against, accept everything
        if (baseUrlFolder.isEmpty()) {
            logger.info("hasBaseUrl() no base url!");
            return true;
        }

        final String candidate = toCheck.toLowerCase();
        final String prefix = baseUrlFolder.toLowerCase();
        return candidate.startsWith(prefix);
    }

    /**
     * Forwards a status text to the application instance. Nothing happens
     * when no application instance is attached.
     *
     * @param toSet  status text
     */
    private void setStatus(final String toSet) {
        if (ds == null) {
            return;
        }
        ds.setStatus(toSet);
    }

    /**
     * Adds a link found on a page to the link list if it qualifies.
     *
     * The link is trimmed, cut at the first '#', its backslashes are replaced
     * by slashes and surrounding single quotes are removed. It is then
     * resolved against the page with LinkValue. The resolved URL is added
     * with addLink() only if it starts with the domain URL (ignoring case),
     * contains no bogus directory, lies below the base URL folder and has no
     * skipped file type. A null, empty or mailto link is ignored.
     *
     * @param page  URL of the page the link was found on
     * @param link  raw link value taken from the tag, may be null
     */
    private void doPossibleAdd(final String page, final String link) {
        if (logger.isDebugEnabled()) {
            logger.debug("doPossibleAdd('" + page + "', '" + link + "') entered");
        }

        // callers may pass null when no link value was found yet; handle it
        // like an empty link instead of failing with a NullPointerException
        String tLink = (link == null) ? "" : link.trim();

        if (!tLink.equals("") && tLink.indexOf("mailto:") == -1) {
            // remove anchor
            int anchorPos = tLink.indexOf("#");
            if (anchorPos != -1) {
                tLink = tLink.substring(0, anchorPos);
            }
            // replace \\
            if (tLink.indexOf("\\") != -1) {
                tLink = Utils.replaceAll("\\", tLink, "/");
            }
            // use string in ''
            if (tLink.startsWith("'") && tLink.endsWith("'")) {
                tLink = tLink.substring(1, tLink.length() - 1);
            }

            LinkValue lv = new LinkValue(page, tLink);
            String realUrl = lv.getRealLink();

            // only links inside the spidered area with an indexable type are kept
            if (realUrl.toLowerCase().startsWith(domainUrl.toLowerCase()) && !hasBogusDirs(realUrl)
                    && hasBaseUrl(realUrl) && !skipType(realUrl)) {
                addLink(realUrl);
            } // end for url has domain prefix
            else {
                if (logger.isDebugEnabled()) {
                    logger.debug(
                            "doPossibleAdd() real url='" + realUrl + "' skipped because it doesn't start with '"
                                    + domainUrl + "', or url is not an indexible file type.");
                }
            }
        } else {
            if (logger.isDebugEnabled()) {
                logger.debug("doPossibleAdd() link='" + link + "' was a mailto or empty.");
            }
        }
    }

    /**
     * Adds a URL to the link list unless it is blank or already contained.
     *
     * Every '|' in the URL is stored as "%7C". The duplicate check (ignoring
     * case) and the insert position are based on this stored form, so that a
     * URL containing '|' is compared with the same text that an earlier call
     * put into the list.
     *
     * The search relies on the list being ordered by URL length, longest
     * first: it stops at the first entry that is shorter than the new URL and
     * inserts the new entry there.
     *
     * @param newUrl  URL to add
     */
    private void addLink(final String newUrl) {
        if (logger.isDebugEnabled()) {
            logger.debug("addLink() try to add link='" + newUrl + "'");
        }

        if (StringUtils.isBlank(newUrl)) {
            logger.warn("addLink() url is empty");
            return;
        }

        // escape before comparing, the list only contains escaped URLs
        final String storedUrl = Utils.replaceAll("|", newUrl, "%7C");
        final int newLen = storedUrl.length();
        int curPos = 0;

        // search in list
        for (SpiderUrl spy : links) {
            String curUrlString = spy.getUrl();
            int curLen = curUrlString.length();

            // more speed,
            // check length, because links are length sorted (desc)
            if (newLen == curLen) {
                if (storedUrl.equalsIgnoreCase(curUrlString)) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("addLink() skipping double link='" + newUrl + "'");
                    }
                    return;
                }
            } else if (newLen > curLen) {
                // all following entries are shorter, insert here
                break;
            }
            curPos++;
        }

        links.add(curPos, new SpiderUrl(storedUrl));
    }

    /**
     * Gets the number of entries in the link list.
     *
     * @return  size of the link list
     */
    private int getNumLinksFound() {
        return this.links.size();
    }

    /**
     * Writes the link list to two files. A link that is not dead is written
     * to the link file as one line of the form
     * url|lastModified|size|contentType|md5, a dead link is written to the bad
     * link file with its URL only.
     *
     * When an index description is set, both file names are taken from the
     * FileEnvironment for the name of that index. An IOException is logged
     * and not passed on.
     */
    private void saveAllLinks() {
        logger.debug("saveAllLinks() entered");

        // an index decides where the lists are stored
        if (dsi != null) {
            outFile = fEnv.getSpiderIndexURLFile(dsi.getName());
            outBadFile = fEnv.getSpiderBadIndexURLFile(dsi.getName());
        }

        final File goodTarget = new File(outFile);
        final File badTarget = new File(outBadFile);

        PrintWriter goodOut = null;
        PrintWriter badOut = null;
        try {
            goodOut = new PrintWriter(new FileWriter(goodTarget));
            badOut = new PrintWriter(new FileWriter(badTarget));

            // one line per link
            for (int i = 0; i < links.size(); i++) {
                final SpiderUrl entry = links.get(i);
                if (entry.getIsDeadLink()) {
                    badOut.println(entry.getUrl());
                    continue;
                }
                goodOut.println(entry.getUrl() + '|' + entry.getLastModified() + '|' + entry.getSize() + '|'
                        + entry.getContentType() + '|' + entry.getMd5());
            }
            setStatus(links.size() + " total links found, " + numSkips + " links skipped.");
        } catch (IOException ioe) {
            logger.fatal("saveAllLinks() failed", ioe);
        } finally {
            if (goodOut != null) {
                goodOut.close();
            }
            if (badOut != null) {
                badOut.close();
            }
        }
    }

    /**
     * Shows a message through the application instance. Without an
     * application instance the message is written to the log.
     *
     * @param title    title of the message
     * @param details  text of the message
     */
    private void showMessage(final String title, final String details) {
        if (ds == null) {
            logger.info("showMessage() " + title + "\n" + details);
            return;
        }
        ds.showMessage(title, details);
    }

    /**
     * Reads a local file character by character and passes the links it
     * contains to doPossibleAdd().
     *
     * The href value of a tag starting with "&lt;a " is remembered and passed
     * on when a tag starting with "&lt;/a" is closed. The href value of an area
     * tag and the value of a param tag named "url" are passed on directly.
     * The domain URL and the base URL folder of this instance are replaced by
     * the values derived from the page name. An IOException is logged and not
     * passed on.
     *
     * @param origFile      path of the file to read
     * @param thisPageName  URL the file was downloaded from
     */
    private void checkFileForLinks(final String origFile, final String thisPageName) {
        String shortUrl = Utils.concatEnd(thisPageName, 33);
        setStatus(DocSearch.dsLkngFoLnx + " " + shortUrl);
        domainUrl = Utils.getDomainURL(thisPageName);
        String realTag;
        String lowerTag;
        boolean inTag = false;
        // null until the first anchor, area or matching param tag was read
        String link = null;
        long lastPcnt = 0;
        File testFi = new File(origFile);
        long thisFileSize = testFi.length();
        long curDnldnd = 0;
        StringBuilder tagBuf = new StringBuilder();
        baseUrlFolder = Utils.getBaseURLFolder(thisPageName);

        Reader in = null;
        try {
            // open file
            in = new BufferedReader(new InputStreamReader(new FileInputStream(origFile)));

            int ch;
            while ((ch = in.read()) > -1) {
                char c = (char) ch;
                curDnldnd++;

                // report the progress whenever the percentage changes
                if (thisFileSize > 0) {
                    if (curDnldnd % 10 == 0) {
                        long curPcnt = (curDnldnd * 100) / thisFileSize;
                        if (curPcnt != lastPcnt) {
                            lastPcnt = curPcnt;
                            setStatus(DocSearch.dsLkngFoLnx + " " + shortUrl + " --> (" + curPcnt + " %)");
                        }
                    }
                }

                // start tags
                if (c == '<') {
                    inTag = true;
                    tagBuf = new StringBuilder();
                }
                // end tags
                else if (c == '>') {
                    tagBuf.append(c);
                    realTag = tagBuf.toString();
                    lowerTag = realTag.toLowerCase();

                    if (lowerTag.startsWith("</a")) {
                        // a closing tag may appear before any link value was
                        // found; there is nothing to add in that case
                        if (link != null) {
                            doPossibleAdd(thisPageName, link);
                        }
                    } else if (lowerTag.startsWith("<area")) {
                        link = Utils.getTagString("href=", realTag);
                        link = Utils.getNormalUrl(link);

                        doPossibleAdd(thisPageName, link);
                    } else if (lowerTag.startsWith("<param")) {
                        String appletParam = Utils.getTagString("name=", realTag);
                        if (appletParam.toLowerCase().equals("url")) {
                            link = Utils.getTagString("value=", realTag);
                            link = Utils.getNormalUrl(link);

                            doPossibleAdd(thisPageName, link);
                        }
                    }
                    // remember the target, it is added when the anchor is closed
                    else if (lowerTag.startsWith("<a ")) {
                        link = Utils.getTagString("href=", realTag);
                        link = Utils.getNormalUrl(link);
                    }
                    inTag = false;
                }
                // collect the characters of the current tag
                if (inTag) {
                    tagBuf.append(c);
                }
            }

            in.close();
        } catch (IOException ioe) {
            logger.fatal("checkFileForLinks() failed", ioe);
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Gets the number of documents that update() removed from the index
     * because their URL was broken.
     *
     * @return  number of deletes
     */
    public int getNumDeletes() {
        return this.numDeletes;
    }

    /**
     * Gets the number of changed documents that update() indexed again
     * successfully.
     *
     * @return  number of updates
     */
    public int getNumUpdates() {
        return this.numChanges;
    }

    /**
     * Gets the counter for new documents; update() resets it to 0 when it
     * starts.
     *
     * @return  number of new documents
     */
    public int getNumNew() {
        return this.numNew;
    }

    /**
     * Gets the number of documents whose indexing failed during update().
     *
     * @return  number of fails
     */
    public int getNumFails() {
        return this.numFails;
    }

    /**
     * Gets the number of documents that update() found without changes.
     *
     * @return  number of unchanged documents
     */
    public int getNumUnchanged() {
        return this.numUnChanged;
    }

    /**
     * Gets the number of documents for which indexing during update()
     * returned the "meta robots - no index" result.
     *
     * @return  number of documents that were not indexed for that reason
     */
    public int getNumMetaNoIdx() {
        return this.numMetaNoIdx;
    }

    /**
     * Method update
     *
     * @throws IOException
     */
    public void update() throws IOException {
        numDeletes = 0;
        numChanges = 0;
        numNew = 0;
        numFails = 0;
        numUnChanged = 0;
        numMetaNoIdx = 0;

        IndexReader ir = IndexReader.open(dsi.getIndexPath());
        int maxNumDocs = ir.maxDoc();
        int maxTotal = maxNumDocs + maxNumDocs / 10;
        int curDocNum = 0;
        if (ds != null) {
            ds.setStatus(DocSearch.dsTtlDxInIdx + " " + maxNumDocs);
            ds.setIsWorking(true);
            ds.setProgressMax(maxTotal * 2);
            ds.setCurProgressMSG("Spidering Files...");
        }

        // assign index location to urls currently in the index
        int lastFound = 0;
        for (SpiderUrl spy : links) {
            curDocNum++;

            if (ds != null) {
                ds.setCurProgress(curDocNum);
                if (!ds.getIsWorking()) {
                    break;
                }
            }

            String curFi = spy.getUrl();
            lastFound = ds.idx.spiderIndexNum(lastFound, curFi, ir);
            spy.setIndexLocation(lastFound);

            if (lastFound == -1) {
                logger.debug("update() " + curFi + " currently is not in the index");
            }
        }

        // now iterate over all the spider urls
        int curSpiderNum = getNextUrlNo();
        int totalSpidered = 0;
        while (curSpiderNum != -1) {
            curDocNum++;

            if (ds != null) {
                ds.setCurProgress(curDocNum);
                if (!ds.getIsWorking()) {
                    break;
                }
            }

            SpiderUrl curSpider = getSpiderUrl(curSpiderNum);
            int curNumLinksFound = getNumLinksFound();
            int curIdxNum = curSpider.getIndexLocation();
            // TODO is this getsize realy needed, than the url ist in index?
            long curUrlSize = netUtils.getURLSize(curSpider.getUrl());
            String shortUrl = Utils.concatEnd(curSpider.getUrl(), 33);
            String dnldTmpName = getDownloadFileName(curSpider.getContentType(), curSpider.getUrl().toLowerCase());
            String downloadFile = FileUtils.addFolder(downloadFileDir, dnldTmpName);

            // document is to big
            if (curUrlSize > maxFileSizeToGet) {
                logger.debug("update() '" + shortUrl + "' is to big");
                setStatus(I18n.getString("skipping_file_too_big") + " (" + curUrlSize + " > " + maxFileSizeToGet
                        + ") " + shortUrl);
                curSpider.setSize(curUrlSize);
            }
            // document is in index
            else if (curIdxNum != -1) {
                logger.debug("update() '" + shortUrl + "' is in index");
                setStatus(DocSearch.dsCkgFoUpdtsToDoc + " " + shortUrl + " (" + totalSpidered + " / "
                        + curNumLinksFound + ")");

                int curSpiderStatus = netUtils.getURLStatus(curSpider, downloadFile);
                switch (curSpiderStatus) {
                case -1: // broken url
                    logger.debug("update() '" + shortUrl + "' is broken");
                    setStatus(DocSearch.dsBknLink + " " + shortUrl);
                    curSpider.setIsDeadLink(true);
                    // remove from index
                    ir.deleteDocument(curIdxNum);
                    numDeletes++;
                    break;
                case 0: // same
                    logger.debug("update() '" + shortUrl + "' no changes");
                    setStatus(DocSearch.lnkNoChanges + " " + shortUrl);
                    numUnChanged++;
                    totalSpidered++;
                    break;
                case 1: // changed
                    logger.debug("update() '" + shortUrl + "' is changed");
                    setStatus(DocSearch.dsReIdxgLnk + " " + shortUrl);
                    ir.deleteDocument(curIdxNum);
                    ir.close();
                    iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                    // iw.setUseCompoundFile(true);
                    int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                    iw.close();
                    ir = IndexReader.open(dsi.getIndexPath());
                    if (curAddedSuccess == 0) {
                        numChanges++;
                        totalSpidered++;
                    } else if (curAddedSuccess == 2) {
                        numMetaNoIdx++;
                    } else if (curAddedSuccess == 1) {
                        logger.warn("update() indexing failed " + shortUrl);
                        numFails++;
                    }

                    // get links from downloaded file
                    if (isHtml(curSpider.getUrl())) {
                        checkFileForLinks(downloadFile, curSpider.getUrl());
                    }
                    break;
                }
            }
            // document is not in index
            else {
                logger.debug("update() '" + shortUrl + "' is not in index");
                setStatus(DocSearch.dsSpiderNewUrl + " " + shortUrl + " (" + totalSpidered + " / "
                        + curNumLinksFound + ")");

                boolean downloadOk = netUtils.downloadURLToFile(curSpider, downloadFile);
                if (downloadOk) {
                    iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                    // iw.setUseCompoundFile(true);
                    int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                    iw.close();
                    ir.close();
                    ir = IndexReader.open(dsi.getIndexPath());
                    if (curAddedSuccess == 0) {
                        numNew++;
                        totalSpidered++;
                    } else if (curAddedSuccess == 2) {
                        numMetaNoIdx++;
                    } else if (curAddedSuccess == 1) {
                        logger.warn("update() indexing failed " + shortUrl);
                        numFails++;
                    }
                    if (isHtml(curSpider.getUrl())) {
                        checkFileForLinks(downloadFile, curSpider.getUrl());
                    }
                } else {
                    setStatus(DocSearch.dsBknLink + " " + shortUrl);
                    curSpider.setIsDeadLink(true);
                }
            }

            // last things to do
            curSpider.setSpidered(true);
            curSpiderNum = getNextUrlNo();
            if (curSpiderNum == -1) {
                break;
            }
            if (totalSpidered > maxTotal) {
                break;
            }

            // delete temp file
            if (!FileUtils.deleteFile(downloadFile)) {
                logger.warn("update() can't delete file '" + downloadFile + "'");
            }
        }

        setStatus(DocSearch.dsSpdrUpdteComp + " " + dsi.getName());
        saveAllLinks();

        // update the date of the index
        dsi.setLastIndexed(DateTimeUtils.getToday());
        ir.close();
        ds.resetProgress();
    }

    /**
     * Builds the name of the temporary file a spidered document is stored in.
     *
     * <p>The name is always {@code temp_spidered_document_} followed by
     * {@code USER_NAME} and a file extension. The extension is chosen as
     * follows:
     * <ol>
     * <li>{@code .htm} if the media type part of the content type ends with
     * "html" (case-insensitive; parameters such as {@code ; charset=UTF-8}
     * are ignored),</li>
     * <li>{@code .htm} if the URL is {@code null} or ends with "/",</li>
     * <li>otherwise the extension reported by
     * {@code FileUtils.getFileExtension(url)}, or {@code .htm} if that
     * extension is {@code null} or "unknown".</li>
     * </ol>
     *
     * @param contentType  content type of object, may be {@code null}
     * @param url          URL
     * @return             temp filename
     */
    protected String getDownloadFileName(final String contentType, final String url) {
        if (logger.isDebugEnabled()) {
            logger.debug("getDownloadFileName('" + contentType + "', '" + url + "') entered");
        }

        StringBuilder result = new StringBuilder();
        result.append("temp_spidered_document_").append(USER_NAME);

        // Reduce the content type to its bare media type: a header value such
        // as "text/html; charset=UTF-8" carries parameters after ';' that
        // would otherwise hide the "html" suffix. A missing content type is
        // treated as empty instead of causing a NullPointerException.
        String mediaType = "";
        if (contentType != null) {
            int paramStart = contentType.indexOf(';');
            String bare = (paramStart == -1) ? contentType : contentType.substring(0, paramStart);
            mediaType = bare.trim();
        }

        // first: content type
        // (regionMatches with ignoreCase is locale-independent and simply
        // returns false when the media type is shorter than "html")
        if (mediaType.regionMatches(true, mediaType.length() - 4, "html", 0, 4)) {
            result.append(".htm");
        }
        // second: extension
        else if (url == null || url.endsWith("/")) {
            result.append(".htm");
        } else {
            String extension = FileUtils.getFileExtension(url);
            if (extension == null || "unknown".equals(extension)) {
                result.append(".htm");
            } else {
                result.append('.').append(extension);
            }
        }

        if (logger.isDebugEnabled()) {
            logger.debug("getDownloadFileName() result='" + result + "'");
        }

        return result.toString();
    }
}