com.jaeksoft.searchlib.scheduler.task.TaskFtpXmlFeed.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.scheduler.task.TaskFtpXmlFeed.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.scheduler.task;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPConnectionClosedException;
import org.apache.commons.net.ftp.FTPFile;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.file.process.fileInstances.FtpFileInstance;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.scheduler.TaskAbstract;
import com.jaeksoft.searchlib.scheduler.TaskLog;
import com.jaeksoft.searchlib.scheduler.TaskProperties;
import com.jaeksoft.searchlib.scheduler.TaskPropertyDef;
import com.jaeksoft.searchlib.scheduler.TaskPropertyType;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.Variables;

/**
 * Scheduler task that lists the files of a remote FTP directory, downloads
 * each one as an XML document (optionally transformed through an XSL
 * stylesheet first) and passes it to {@link Client#updateXmlDocuments} and
 * {@link Client#deleteXmlDocuments}.
 * <p>
 * Files are handled in ascending file name order. Depending on the task
 * properties, the index can be emptied when the first matching file is found,
 * and each file can be removed from the server once it has been loaded.
 */
public class TaskFtpXmlFeed extends TaskAbstract {

    final private TaskPropertyDef propServer = new TaskPropertyDef(TaskPropertyType.textBox, "FTP server",
            "FTP server (hostname)", "The hostname of the FTP server", 100);

    final private TaskPropertyDef propPath = new TaskPropertyDef(TaskPropertyType.textBox, "Path", "Path",
            "The remote path", 100);

    final private TaskPropertyDef propLogin = new TaskPropertyDef(TaskPropertyType.textBox, "Login", "Login",
            "The username on the FTP server", 50);

    final private TaskPropertyDef propPassword = new TaskPropertyDef(TaskPropertyType.password, "Password",
            "Password", "The password on the FTP server", 50);

    final private TaskPropertyDef propFileNamePattern = new TaskPropertyDef(TaskPropertyType.textBox,
            "File name pattern", "File name pattern", "A regular expression to filter which files will be handled",
            50);

    final private TaskPropertyDef propXsl = new TaskPropertyDef(TaskPropertyType.multilineTextBox, "XSL", "XSL",
            "An option XSL stylesheet", 100, 30);

    final private TaskPropertyDef propDeleteAfterLoad = new TaskPropertyDef(TaskPropertyType.listBox,
            "Delete after load", "Delete after load", "Decide if the document will be deleted after being loaded",
            10);

    final private TaskPropertyDef propTruncateIndexWhenFilesFound = new TaskPropertyDef(TaskPropertyType.listBox,
            "Truncate index when files are found", "Truncate index when files are found",
            "Decide to truncate the index before loading the XML file", 10);

    final private TaskPropertyDef propBuffersize = new TaskPropertyDef(TaskPropertyType.textBox, "Buffer size",
            "Buffer size", "How many documents will be write to the index in each transaction", 10);

    final private TaskPropertyDef[] taskPropertyDefs = { propServer, propPath, propLogin, propPassword,
            propFileNamePattern, propXsl, propDeleteAfterLoad, propTruncateIndexWhenFilesFound, propBuffersize };

    /**
     * @return the display name of this task
     */
    @Override
    public String getName() {
        return "FTP XML feed ";
    }

    /**
     * @return every property definition exposed by this task
     */
    @Override
    public TaskPropertyDef[] getPropertyList() {
        return taskPropertyDefs;
    }

    /**
     * Returns the list of allowed values for a property.
     *
     * @return {@link ClassPropertyEnum#BOOLEAN_LIST} for the two list box
     *         properties ("Delete after load" and "Truncate index when files
     *         are found"), {@code null} for every other property
     */
    @Override
    public String[] getPropertyValues(Config config, TaskPropertyDef propertyDef, TaskProperties taskProperties)
            throws SearchLibException {
        if (propertyDef == propDeleteAfterLoad || propertyDef == propTruncateIndexWhenFilesFound) {
            return ClassPropertyEnum.BOOLEAN_LIST;
        }
        return null;
    }

    /**
     * Returns the default value of a property.
     *
     * @return "/" for the path, "50" for the buffer size, "false" for the two
     *         boolean properties, {@code null} for every other property
     */
    @Override
    public String getDefaultValue(Config config, TaskPropertyDef propertyDef) {
        if (propertyDef == propPath) {
            return "/";
        }
        if (propertyDef == propBuffersize) {
            return "50";
        }
        if (propertyDef == propDeleteAfterLoad || propertyDef == propTruncateIndexWhenFilesFound) {
            return Boolean.FALSE.toString();
        }
        return null;
    }

    /**
     * Makes sure the FTP client holds a usable, logged-in connection.
     * <p>
     * If the client is connected and answers a NOOP positively, nothing is
     * done. Otherwise any stale connection is released and a new one is opened
     * and authenticated.
     *
     * @param ftp
     *            the FTP client to check or (re)connect
     * @param server
     *            the hostname of the FTP server
     * @param login
     *            the username
     * @param password
     *            the password
     * @throws IOException
     *             if the connection cannot be established or the server
     *             rejects the credentials
     */
    private void checkConnect(FTPClient ftp, String server, String login, String password) throws IOException {
        if (ftp.isConnected()) {
            try {
                if (ftp.sendNoOp()) {
                    return;
                }
            } catch (FTPConnectionClosedException e) {
                Logging.warn(e);
            }
            // The control connection is no longer usable: release its socket
            // before opening a new one, otherwise connect() would leak it.
            try {
                ftp.disconnect();
            } catch (IOException e) {
                Logging.warn(e);
            }
        }
        ftp.setConnectTimeout(120000);
        ftp.setControlKeepAliveTimeout(180);
        ftp.setDataTimeout(120000);
        ftp.connect(server);
        // login() reports a refusal through its return value, not an exception
        if (!ftp.login(login, password)) {
            throw new IOException("FTP login failed on " + server + ": " + ftp.getReplyString());
        }
    }

    /**
     * Compiles the optional file name filter.
     *
     * @param fileNamePattern
     *            the regular expression, may be {@code null} or empty
     * @return the compiled pattern, or {@code null} when no filter is set
     * @throws SearchLibException
     *             if the regular expression is not valid
     */
    private static Pattern compileFileNamePattern(String fileNamePattern) throws SearchLibException {
        if (fileNamePattern == null || fileNamePattern.length() == 0) {
            return null;
        }
        try {
            return Pattern.compile(fileNamePattern);
        } catch (IllegalArgumentException e) {
            // PatternSyntaxException is an IllegalArgumentException
            throw new SearchLibException(e);
        }
    }

    /**
     * Parses the buffer size property.
     *
     * @param value
     *            the property value, may be {@code null} or empty
     * @return the parsed value, or 50 when the property is not set
     * @throws SearchLibException
     *             if the value is not a valid integer
     */
    private static int parseBufferSize(String value) throws SearchLibException {
        if (value == null || value.length() == 0) {
            return 50;
        }
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new SearchLibException(e);
        }
    }

    /**
     * Builds the remote path of a file located in the given remote directory.
     * <p>
     * {@link FilenameUtils#concat(String, String)} uses the separator of the
     * local system, so the result is converted back to '/' which is what the
     * FTP server expects.
     *
     * @throws SearchLibException
     *             if the two parts cannot be combined into a valid path
     */
    private static String buildRemotePath(String path, String fileName) throws SearchLibException {
        String remotePath = FilenameUtils.separatorsToUnix(FilenameUtils.concat(path, fileName));
        if (remotePath == null) {
            throw new SearchLibException("Invalid remote path: " + path + " / " + fileName);
        }
        return remotePath;
    }

    /**
     * Connects to the FTP server, then loads every listed file whose name
     * matches the optional pattern, in ascending file name order.
     * <p>
     * For each file: the index is emptied first if requested (only once per
     * execution), the content is optionally transformed with the XSL
     * stylesheet into a temporary file, then the resulting XML document is
     * passed to the client for update and deletion. The remote file is deleted
     * afterwards when "Delete after load" is enabled.
     *
     * @param client
     *            the client owning the index to feed
     * @param properties
     *            the values of the task properties
     * @param variables
     *            not used by this task
     * @param taskLog
     *            receives progress information
     * @throws SearchLibException
     *             if a property is invalid, or if any FTP, XML or indexing
     *             error occurs
     */
    @Override
    public void execute(Client client, TaskProperties properties, Variables variables, TaskLog taskLog)
            throws SearchLibException {
        String server = properties.getValue(propServer);
        String path = properties.getValue(propPath);
        String login = properties.getValue(propLogin);
        String password = properties.getValue(propPassword);
        boolean deleteAfterLoad = Boolean.TRUE.toString().equals(properties.getValue(propDeleteAfterLoad));
        boolean truncateWhenFilesFound = Boolean.TRUE.toString()
                .equals(properties.getValue(propTruncateIndexWhenFilesFound));
        // Validate the user supplied values before acquiring any resource
        Pattern pattern = compileFileNamePattern(properties.getValue(propFileNamePattern));
        int bufferSize = parseBufferSize(properties.getValue(propBuffersize));
        String xsl = properties.getValue(propXsl);
        boolean useXsl = xsl != null && xsl.length() > 0;

        File xmlTempResult = null;
        HttpDownloader httpDownloader = client.getWebCrawlMaster().getNewHttpDownloader(true);
        FTPClient ftp = null;
        InputStream inputStream = null;
        try {
            // FTP Connection
            ftp = new FTPClient();
            checkConnect(ftp, server, login, password);
            FTPFile[] files = ftp.listFiles(path, new FtpFileInstance.FtpInstanceFileFilter(true, false, null));
            if (files == null) {
                return;
            }
            // Sort by ascendant filename
            String[] fileNames = new String[files.length];
            for (int i = 0; i < files.length; i++) {
                fileNames[i] = files[i].getName();
            }
            Arrays.sort(fileNames);

            int ignored = 0;
            int loaded = 0;
            boolean alreadyTruncated = false;
            for (String fileName : fileNames) {
                if (pattern != null && !pattern.matcher(fileName).find()) {
                    ignored++;
                    continue;
                }
                String filePathName = buildRemotePath(path, fileName);
                if (truncateWhenFilesFound && !alreadyTruncated) {
                    client.deleteAll();
                    alreadyTruncated = true;
                }
                taskLog.setInfo("Working on: " + filePathName);
                inputStream = ftp.retrieveFileStream(filePathName);
                if (inputStream == null) {
                    // The server refused the transfer: report its reply
                    // instead of failing later on a null stream.
                    throw new SearchLibException(
                            "FTP error, unable to retrieve " + filePathName + ": " + ftp.getReplyString());
                }
                Node xmlDoc;
                if (useXsl) {
                    xmlTempResult = File.createTempFile("ossftpfeed", ".xml");
                    DomUtils.xslt(new StreamSource(inputStream), xsl, xmlTempResult);
                    xmlDoc = DomUtils.readXml(new StreamSource(xmlTempResult), false);
                } else {
                    xmlDoc = DomUtils.readXml(new StreamSource(inputStream), false);
                }
                client.updateXmlDocuments(xmlDoc, bufferSize, null, httpDownloader, taskLog);
                client.deleteXmlDocuments(xmlDoc, bufferSize, taskLog);
                // The stream must be closed before the transfer is finalized
                inputStream.close();
                inputStream = null;
                if (!ftp.completePendingCommand()) {
                    throw new SearchLibException("FTP Error");
                }
                if (xmlTempResult != null) {
                    xmlTempResult.delete();
                    xmlTempResult = null;
                }
                // Indexing may have taken long enough for the server to drop
                // the control connection
                checkConnect(ftp, server, login, password);
                if (deleteAfterLoad && !ftp.deleteFile(filePathName)) {
                    // Best effort: the file has been loaded, only report
                    // that it could not be removed.
                    Logging.warn(new IOException(
                            "Unable to delete " + filePathName + " on " + server + ": " + ftp.getReplyString()));
                }
                loaded++;
            }
            taskLog.setInfo(loaded + " file(s) loaded - " + ignored + " file(s) ignored");
        } catch (XPathExpressionException e) {
            throw new SearchLibException(e);
        } catch (NoSuchAlgorithmException e) {
            throw new SearchLibException(e);
        } catch (ParserConfigurationException e) {
            throw new SearchLibException(e);
        } catch (SAXException e) {
            throw new SearchLibException(e);
        } catch (IOException e) {
            throw new SearchLibException(e);
        } catch (URISyntaxException e) {
            throw new SearchLibException(e);
        } catch (InstantiationException e) {
            throw new SearchLibException(e);
        } catch (IllegalAccessException e) {
            throw new SearchLibException(e);
        } catch (ClassNotFoundException e) {
            throw new SearchLibException(e);
        } catch (TransformerException e) {
            throw new SearchLibException(e);
        } finally {
            // Release everything that may still be held after a failure
            if (xmlTempResult != null) {
                xmlTempResult.delete();
            }
            IOUtils.close(inputStream);
            try {
                if (ftp != null && ftp.isConnected()) {
                    ftp.disconnect();
                }
            } catch (IOException e) {
                Logging.warn(e);
            }
            if (httpDownloader != null) {
                httpDownloader.release();
            }
        }
    }
}