com.bluedragon.search.index.crawl.CrawlFactory.java Source code

Introduction

Here is the source code for com.bluedragon.search.index.crawl.CrawlFactory.java
Source

/* 
 *  Copyright (C) 2000 - 2011 TagServlet Ltd
 *
 *  This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
 *  
 *  OpenBD is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *  
 *  OpenBD is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *  
 *  You should have received a copy of the GNU General Public License
 *  along with OpenBD.  If not, see http://www.gnu.org/licenses/
 *  
 *  Additional permission under GNU GPL version 3 section 7
 *  
 *  If you modify this Program, or any covered work, by linking or combining 
 *  it with any of the JARS listed in the README.txt (or a modified version of 
 *  that library), containing parts covered by the terms of that JAR, the 
 *  licensors of this Program grant you additional permission to convey the 
 *  resulting work. 
 *  README.txt @ http://www.openbluedragon.org/license/README.txt
 *  
 *  http://www.openbluedragon.org/
 *  
 *  $Id: CrawlFactory.java 2374 2013-06-10 22:14:24Z alan $
 */

package com.bluedragon.search.index.crawl;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.bluedragon.search.DocumentWrap;
import com.bluedragon.search.index.crawl.handler.AbstractFileHandler;
import com.bluedragon.search.index.crawl.handler.CrawlException;
import com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl;
import com.bluedragon.search.index.crawl.handler.FileHandlerJPGImpl;
import com.bluedragon.search.index.crawl.handler.FileHandlerMP3Impl;
import com.bluedragon.search.index.crawl.handler.FileHandlerMSOfficeImpl;
import com.bluedragon.search.index.crawl.handler.FileHandlerOpenOfficeImpl;
import com.bluedragon.search.index.crawl.handler.FileHandlerPDFImpl;
import com.bluedragon.search.index.crawl.handler.FileHandlerTextImpl;
import com.nary.util.string;

/**
 * Registry of {@link AbstractFileHandler}s used to crawl individual files, plus
 * helpers for discovering which files under a directory should be crawled.
 */
public class CrawlFactory extends Object {

    /**
     * Handler lookup table. Keys are the strings each handler reports from
     * getExtensions() and getMimeTypes(); both kinds of key share this one map.
     */
    protected Map<String, AbstractFileHandler> extHandlers;

    /**
     * Creates the factory and registers the built-in file handlers.
     *
     * @param bStoreBody passed on to the MS Office, OpenOffice, text, PDF and HTML
     *                   handlers; the MP3 and JPG handlers are always created with
     *                   {@code false}
     */
    public CrawlFactory(boolean bStoreBody) {
        extHandlers = new HashMap<String, AbstractFileHandler>();

        addHandler(new FileHandlerMSOfficeImpl(bStoreBody));
        addHandler(new FileHandlerOpenOfficeImpl(bStoreBody));
        addHandler(new FileHandlerTextImpl(bStoreBody));
        addHandler(new FileHandlerPDFImpl(bStoreBody));
        addHandler(new FileHandlerHTMLImpl(bStoreBody));
        addHandler(new FileHandlerMP3Impl(false));
        addHandler(new FileHandlerJPGImpl(false));
    }

    /**
     * Registers the handler under every extension and every MIME type it reports.
     * A later registration for the same key replaces the earlier one.
     */
    private void addHandler(AbstractFileHandler fH) {
        Iterator<String> it = fH.getExtensions().iterator();
        while (it.hasNext()) {
            extHandlers.put(it.next(), fH);
        }

        it = fH.getMimeTypes().iterator();
        while (it.hasNext()) {
            extHandlers.put(it.next(), fH);
        }
    }

    /**
     * Crawls the file given, using the handler registered for its extension.
     *
     * @param urlroot passed through unchanged to the handler
     * @param file    the file to crawl
     * @return the document produced by the handler, or {@code null} when the file
     *         is {@code null}, is not an existing regular file, has no registered
     *         handler, or the handler failed with a {@link CrawlException}
     */
    public DocumentWrap crawlFile(String urlroot, File file) {
        // isFile() is false for non-existent paths, so no separate exists() check is needed
        if (file == null || !file.isFile()) {
            return null;
        }

        // Locale.ROOT keeps the case-folding independent of the JVM default locale
        // (e.g. "GIF" must become "gif", not a dotless-i variant under a Turkish locale).
        String ext = org.apache.commons.io.FilenameUtils.getExtension(file.getName())
                .toLowerCase(java.util.Locale.ROOT);

        AbstractFileHandler handler = extHandlers.get(ext);
        if (handler == null) {
            return null;
        }

        try {
            return handler.crawl(urlroot, file);
        } catch (CrawlException ignored) {
            // Deliberately best-effort: a file whose handler fails is reported to the
            // caller as "not crawlable" (null) rather than aborting the caller's work.
            return null;
        }
    }

    /**
     * Gets the list of files to crawl.
     *
     * @param dir      the directory to start from
     * @param exts     extensions to include, separated by any of space, comma, colon
     *                 or semicolon; when {@code null} a default set
     *                 (.htm, .html, .cfm, .cfml, .dbm, .dbml) is used
     * @param bRecurse passed to the file filter; presumably controls whether
     *                 sub-directories are accepted and therefore descended into
     * @return the canonical paths of every file found; empty when {@code dir}
     *         cannot be listed
     * @throws IOException if a canonical path cannot be resolved
     */
    public Set<String> getFilesToCrawl(File dir, String exts, boolean bRecurse) throws IOException {
        ConfigurableFileFilter filter = new ConfigurableFileFilter(getExtensions(exts), bRecurse);
        return recursePath(dir, filter);
    }

    /**
     * Collects the canonical paths of all files accepted by the filter under
     * {@code dir}, descending into every directory the filter accepts.
     */
    private Set<String> recursePath(File dir, FileFilter filter) throws IOException {
        Set<String> found = new HashSet<String>();
        collectFiles(dir, filter, found);
        return found;
    }

    /**
     * Recursive worker for {@link #recursePath(File, FileFilter)}; adds results to
     * {@code found} instead of building and merging a new set per directory.
     */
    private void collectFiles(File dir, FileFilter filter, Set<String> found) throws IOException {
        File[] entries = dir.listFiles(filter);

        // listFiles() returns null (not an empty array) when the path is not a
        // directory or an I/O error occurs, e.g. an unreadable sub-directory.
        if (entries == null) {
            return;
        }

        for (File entry : entries) {
            if (entry.isDirectory()) {
                collectFiles(entry, filter, found);
            } else {
                found.add(entry.getCanonicalPath());
            }
        }
    }

    /**
     * Turns the delimited extension list into a set of lower-case extensions.
     * A token that contains no "." gets one prepended; a {@code null} list
     * yields the default set of extensions.
     */
    private Set<String> getExtensions(String exts) {
        Set<String> extensions = new HashSet<String>();

        if (exts != null) {
            List<String> tokens = string.split(exts, " ,:;");
            for (int i = 0; i < tokens.size(); i++) {
                // Locale.ROOT: extension matching must not depend on the default locale
                String token = tokens.get(i).toLowerCase(java.util.Locale.ROOT);
                if (token.indexOf('.') == -1) {
                    token = "." + token;
                }
                extensions.add(token);
            }
        } else {
            extensions.add(".htm");
            extensions.add(".html");
            extensions.add(".cfm");
            extensions.add(".cfml");
            extensions.add(".dbm");
            extensions.add(".dbml");
        }
        return extensions;
    }

    /**
     * Currently does nothing.
     */
    public void close() {
    }
}