com.gisgraphy.importer.ImporterHelper.java Source code

Java tutorial

Introduction

Here is the source code for com.gisgraphy.importer.ImporterHelper.java

Source

/*******************************************************************************
 *   Gisgraphy Project 
 * 
 *   This library is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU Lesser General Public
 *   License as published by the Free Software Foundation; either
 *   version 2.1 of the License, or (at your option) any later version.
 * 
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *   Lesser General Public License for more details.
 * 
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
 * 
 *  Copyright 2008  Gisgraphy project 
 *  David Masclet <davidmasclet@gisgraphy.com>
 *  
 *  
 *******************************************************************************/
/**
 *
 */
package com.gisgraphy.importer;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Enumeration;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.HeadMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.gisgraphy.domain.geoloc.entity.Adm;
import com.gisgraphy.helper.FeatureClassCodeHelper;

/**
 * Useful methods for importer
 * @author <a href="mailto:david.masclet@gisgraphy.com">David Masclet</a> 
 */
public class ImporterHelper {

    /**
     * The readme filename (it must not be processed)
     */
    public static final String EXCLUDED_README_FILENAME = "readme.txt";
    /**
     * the all country dump file name
     */
    public static final String ALLCOUTRY_FILENAME = "allCountries.txt";
    /**
     * The regexp that every country file dump matches
     */
    public static final String GEONAMES_COUNTRY_FILE_ACCEPT_REGEX_STRING = "[A-Z][A-Z](.txt)";

    public static final String OPENSTREETMAP_US_FILE_ACCEPT_REGEX_STRING = "(US.)[0-9]+(.txt)";

    public static final String QUATTROSHAPES_FILE_ACCEPT_REGEX_STRING = "(localities.txt)";

    public static final String SPLITED_FILE_ACCEPT_REGEX_STRING = "[A-Z][A-Z](.)[0-9]+(.txt)";

    //2 letter but not us, it is managed by SPLITED_OPENSTREETMAP_US_FILE_ACCEPT_REGEX_STRING
    public static final String SPLITED_OPENSTREETMAP_FILE_ACCEPT_REGEX_STRING = "((?!(?:US))[A-Z][A-Z])(.)[0-9]+(.txt)";

    public static final String SPLITED_GEONAMES_ALTERNATENAMES_FILE_ACCEPT_REGEX_STRING = "(US.)[0-9]+(.txt)";

    public static final String SPLITED_ALLCOUNTRIES_FILE_ACCEPT_REGEX_STRING = "(allCountries)(.)[0-9]+(.txt)";

    /**
     * The regexp that every zipped country file dump matches
     */
    public static final String ZIP_FILE_ACCEPT_REGEX_STRING = ".*(.zip)";

    public static final String TAR_BZ2_FILE_ACCEPT_REGEX_STRING = ".*(.tar.bz2)";

    protected static final Logger logger = LoggerFactory.getLogger(ImporterHelper.class);

    private static HttpClientParams params = new HttpClientParams() {
        {
            setConnectionManagerTimeout(2000);
            setSoTimeout(2000);
        }
    };
    private static MultiThreadedHttpConnectionManager connectionManager = new MultiThreadedHttpConnectionManager();
    private static HttpClient client = new HttpClient(connectionManager) {
        {
            setParams(params);
        }
    };

    public static FileFilter countryFileFilter = new FileFilter() {
        public boolean accept(File file) {
            Pattern patternGeonames = Pattern.compile(GEONAMES_COUNTRY_FILE_ACCEPT_REGEX_STRING);
            Pattern patternOpenStreetMapUS = Pattern.compile(OPENSTREETMAP_US_FILE_ACCEPT_REGEX_STRING);
            Pattern patternQuattroshapes = Pattern.compile(QUATTROSHAPES_FILE_ACCEPT_REGEX_STRING);

            return (file.isFile() && file.exists()) && !EXCLUDED_README_FILENAME.equals(file.getName())
                    && (patternGeonames.matcher(file.getName()).matches()
                            || ALLCOUTRY_FILENAME.equals(file.getName())
                            || patternOpenStreetMapUS.matcher(file.getName()).matches()
                            || patternQuattroshapes.matcher(file.getName()).matches());
        }
    };

    public static FileFilter splitedFileFilter = new FileFilter() {
        public boolean accept(File file) {
            Pattern patternSplit = Pattern.compile(SPLITED_FILE_ACCEPT_REGEX_STRING);
            Pattern patternAllCountriesSplit = Pattern.compile(SPLITED_ALLCOUNTRIES_FILE_ACCEPT_REGEX_STRING);

            return (file.isFile() && file.exists()) && !EXCLUDED_README_FILENAME.equals(file.getName())
                    && (patternAllCountriesSplit.matcher(file.getName()).matches()
                            || patternSplit.matcher(file.getName()).matches());
        }
    };

    private static FileFilter ZipFileFilter = new FileFilter() {
        public boolean accept(File file) {
            Pattern pattern = Pattern.compile(ZIP_FILE_ACCEPT_REGEX_STRING);

            return (file.isFile() && file.exists()) && pattern.matcher(file.getName()).matches();
        }
    };

    private static FileFilter tarBZ2FileFilter = new FileFilter() {
        public boolean accept(File file) {
            Pattern pattern = Pattern.compile(TAR_BZ2_FILE_ACCEPT_REGEX_STRING);

            return (file.isFile() && file.exists()) && pattern.matcher(file.getName()).matches();
        }
    };

    /**
     * @param directoryPath
     *            The directory where files are
     * @see #GEONAMES_COUNTRY_FILE_ACCEPT_REGEX_STRING
     * @return the allcountries.txt (@see {@linkplain #ALLCOUTRY_FILENAME} file
     *         if present or the list of country file to Import or an empty
     *         array if there is no file
     */
    public static File[] listCountryFilesToImport(String directoryPath) {

        File dir = new File(directoryPath);

        File[] files = dir.listFiles(countryFileFilter);

        if (files == null) {
            return new File[0];
        }

        for (File file : files) {
            if (ALLCOUTRY_FILENAME.equals(file.getName())) {
                files = new File[1];
                files[0] = file;
                logger.info(ALLCOUTRY_FILENAME
                        + " is present. Only this file will be imported. all other country files will be ignore");
                break;
            }
        }

        if (files.length == 0) {
            logger.warn("there is no file to import in " + directoryPath);
        }

        // for Log purpose
        for (int i = 0; i < files.length; i++) {
            logger.info(files[i].getName() + " is an importable File");
        }
        logger.info(files.length + " files are importable files");

        return files;
    }

    /**
     * @param directoryPath
     *            The directory where splited files are
     * 
     */
    public static File[] listSplitedFilesToImport(String directoryPath) {

        File dir = new File(directoryPath);

        File[] files = dir.listFiles(splitedFileFilter);

        if (files == null) {
            return new File[0];
        }

        if (files.length == 0) {
            logger.warn("there is no file to import in " + directoryPath);
        }

        // for Log purpose
        for (int i = 0; i < files.length; i++) {
            logger.info(files[i].getName() + " is a Geonames splited importable File");
        }
        logger.info(files.length + " files are Geonames importable files");

        return files;
    }

    /**
     * @param directoryPath
     *            The directory where Geonames files are to be downloaded in
     *            order to be processed
     * @see #ZIP_FILE_ACCEPT_REGEX_STRING
     * @return all the zip files present in the specified directory or an empty
     *         array if there is no file
     */
    public static File[] listZipFiles(String directoryPath) {

        File dir = new File(directoryPath);

        File[] files = dir.listFiles(ZipFileFilter);
        return files == null ? new File[0] : files;
    }

    /**
     * @param directoryPath
     *            The directory where openstreetmap files are to be downloaded
     *            in order to be processed
     * @see #TAR_BZ2_FILE_ACCEPT_REGEX_STRING
     * @return all the zip files present in the specified directory or an empty
     *         array if there is no file
     */
    public static File[] listTarFiles(String directoryPath) {

        File dir = new File(directoryPath);

        File[] files = dir.listFiles(tarBZ2FileFilter);
        return files == null ? new File[0] : files;
    }

    /**
     * @param URL the HTTP URL
     * @return The size of the HTTP file using HTTP head method 
     * or -1 if error or the file doesn't exists
     */
    public static long getHttpFileSize(String URL) {
        HeadMethod headMethod = new HeadMethod(URL);
        //we can not follow redirect because Geonames send a 302 found HTTP status code when a file doen't exists
        headMethod.setFollowRedirects(false);
        try {
            int code = client.executeMethod(headMethod);
            int firstDigitOfCode = code / 100;
            switch (firstDigitOfCode) {
            case 4:
                logger.error("Can not determine HTTP file size of " + URL + " because it does not exists (" + code
                        + ")");
                return -1;
            //needed to catch 3XX code because Geonames send a 302 found HTTP status code when a file doen't exists
            case 3:
                logger.error("Can not determine HTTP file size of " + URL + " because it does not exists (" + code
                        + ")");
                return -1;
            case 5:
                logger.error(
                        "Can not determine HTTP file size of " + URL + " because the server send an error " + code);
                return -1;

            default:
                break;
            }
            Header[] contentLengthHeaders = headMethod.getResponseHeaders("Content-Length");
            if (contentLengthHeaders.length == 1) {
                logger.info("HTTP file size of " + URL + " = " + contentLengthHeaders[0].getValue());
                return new Long(contentLengthHeaders[0].getValue());
            } else if (contentLengthHeaders.length <= 0) {
                return -1L;
            }
        } catch (HttpException e) {
            logger.error("can not execute head method for " + URL + " : " + e.getMessage(), e);
        } catch (IOException e) {
            logger.error("can not execute head method for " + URL + " : " + e.getMessage(), e);
        } finally {
            headMethod.releaseConnection();
        }
        return -1;
    }

    /**
     * @param urlsAsString
     * @return true if ALL the url doesn't retrun 200 or 3XX code 
     * and are valids
     */
    public static boolean checkUrls(List<String> urlsAsString) {
        if (urlsAsString == null) {
            return false;
        }
        for (String url : urlsAsString) {
            if (!checkUrl(url)) {
                return false;
            }
        }
        return true;
    }

    /**
     * check if an url doesn't return 200 or 3XX code
     * @param urlAsString the url to check
     * @return true if the url exists and is valid
     */
    public static boolean checkUrl(String urlAsString) {
        if (urlAsString == null) {
            logger.error("can not check null URL");
            return false;
        }
        URL url;
        try {
            url = new URL(urlAsString);
        } catch (MalformedURLException e) {
            logger.error(urlAsString + " is not a valid url, can not check.");
            return false;
        }
        int responseCode;
        String responseMessage = "NO RESPONSE MESSAGE";
        Object content = "NO CONTENT";
        HttpURLConnection huc;
        try {
            huc = (HttpURLConnection) url.openConnection();
            huc.setRequestMethod("HEAD");
            responseCode = huc.getResponseCode();
            content = huc.getContent();
            responseMessage = huc.getResponseMessage();
        } catch (ProtocolException e) {
            logger.error("can not check url " + e.getMessage(), e);
            return false;
        } catch (IOException e) {
            logger.error("can not check url " + e.getMessage(), e);
            return false;
        }

        if (responseCode == 200 || (responseCode > 300 && responseCode < 400)) {
            logger.info("URL " + urlAsString + " exists");
            return true;
        } else {
            logger.error(urlAsString + " return a " + responseCode + " : " + content + "/" + responseMessage);
            return false;
        }
    }

    /**
     * @param address
     *            the address of the file to be downloaded
     * @param localFileName
     *            the local file name (with absolute path)
     */
    public static void download(String address, String localFileName) throws FileNotFoundException {
        logger.info("download file " + address + " to " + localFileName);
        OutputStream out = null;
        HttpURLConnection conn = null;
        InputStream in = null;
        try {
            URL url = new URL(address);
            conn = (HttpURLConnection) url.openConnection();
            if (conn instanceof HttpURLConnection) {
                ((HttpURLConnection) conn).setInstanceFollowRedirects(false);

                int responseCode = ((HttpURLConnection) conn).getResponseCode();
                //manage most frequent error code and Gisgraphy specific one
                switch (responseCode) {
                case 509:
                    throw new RuntimeException("Sorry, there is too many users connected for " + address
                            + ", this site has limmited resources, please try again later");
                case 500:
                    throw new RuntimeException("Sorry, the server return an 500 status code for " + address
                            + ", an internal error has occured");
                case 404:
                    throw new FileNotFoundException("Sorry, the server return an 404 status code for " + address
                            + ", the file probably not exists or the URL is not correct");
                case 302:
                    throw new FileNotFoundException("Sorry, the server return an 302 status code for " + address
                            + ", the file is not at the correct URL");
                default:
                    break;
                }

            }
            in = conn.getInputStream();
            out = new BufferedOutputStream(new FileOutputStream(localFileName));
            byte[] buffer = new byte[1024];
            int numRead;
            long numWritten = 0;
            while ((numRead = in.read(buffer)) != -1) {
                out.write(buffer, 0, numRead);
                numWritten += numRead;
            }
            logger.info(localFileName + "\t" + numWritten);
        } catch (UnknownHostException e) {
            String errorMessage = "can not download " + address + " to " + localFileName + " : " + e.getMessage()
                    + ". if the host exists and is reachable,"
                    + " maybe this links can help : http://www.gisgraphy.com/forum/viewtopic.php?f=3&t=64 ";
            logger.warn(errorMessage);
            throw new ImporterException(errorMessage, e);
        } catch (FileNotFoundException e) {
            throw e;
        } catch (Exception e) {
            logger.warn("can not download " + address + " to " + localFileName + " : " + e.getMessage());
            throw new ImporterException(e);
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
                if (out != null) {
                    out.flush();
                    out.close();
                }
            } catch (IOException ioe) {
                logger.error("cannot close streams");
            }
        }
    }

    /**
     * unzip a file in the same directory as the zipped file
     * 
     * @param file
     *            The file to unzip
     */
    public static void unzipFile(File file) {
        logger.info("will Extracting file: " + file.getName());
        Enumeration<? extends ZipEntry> entries;
        ZipFile zipFile;

        try {
            zipFile = new ZipFile(file);

            entries = zipFile.entries();

            while (entries.hasMoreElements()) {
                ZipEntry entry = (ZipEntry) entries.nextElement();

                if (entry.isDirectory()) {
                    // Assume directories are stored parents first then
                    // children.
                    (new File(entry.getName())).mkdir();
                    continue;
                }

                logger.info("Extracting file: " + entry.getName() + " to " + file.getParent() + File.separator
                        + entry.getName());
                copyInputStream(zipFile.getInputStream(entry), new BufferedOutputStream(
                        new FileOutputStream(file.getParent() + File.separator + entry.getName())));
            }

            zipFile.close();
        } catch (IOException e) {
            logger.error("can not unzip " + file.getName() + " : " + e.getMessage(), e);
            throw new ImporterException(e);
        }
    }

    private static final void copyInputStream(InputStream in, OutputStream out) throws IOException {
        byte[] buffer = new byte[1024];
        int len;

        while ((len = in.read(buffer)) >= 0) {
            out.write(buffer, 0, len);
        }

        in.close();
        out.close();
    }

    /**
     * @param fields
     *            the fields corresponding to a split line of the csv geonames file
     * @return the modified fields whith the feature code change to
     *         ADM1,ADM2,ADM3,ADM4 according to the ADMcodes. e.g id adm1code
     *         and Adm2 code are not null : the feature code will be change to
     *         ADM2.
     */
    public static String[] virtualizeADMD(String[] fields) {
        if (fields[7] != null && "ADMD".equals(fields[7]) && fields[6] != null && "A".equals(fields[6])) {
            // it is an ADMD, will try to detect level
            int level = Adm.getProcessedLevelFromCodes(fields[10], fields[11], fields[12], fields[13]);
            if (level != 0) {
                fields[7] = "ADM" + level;
            }
        }
        return fields;

    }

    public static String[] correctLastAdmCodeIfPossible(String[] fields) {
        if (FeatureClassCodeHelper.is_Adm(fields[6], fields[7])
                && !AbstractSimpleImporterProcessor.isEmptyField(fields, 0, false)) {
            int level = Adm.getProcessedLevelFromFeatureClassCode(fields[6], fields[7]);
            switch (level) {
            case 0:
                return fields;
            case 1:
                if (AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false)) {
                    fields[10] = fields[0];// asign adm1code with featureid
                }
                return fields;
            case 2:
                if (!AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false)
                        && AbstractSimpleImporterProcessor.isEmptyField(fields, 11, false)) {
                    fields[11] = fields[0];// asign adm2code with featureid
                }
                return fields;
            case 3:
                if (!AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false)
                        && !AbstractSimpleImporterProcessor.isEmptyField(fields, 11, false)
                        && AbstractSimpleImporterProcessor.isEmptyField(fields, 12, false)) {
                    fields[12] = fields[0];// asign adm3code with featureid
                }
                return fields;
            case 4:
                if (!AbstractSimpleImporterProcessor.isEmptyField(fields, 10, false)
                        && !AbstractSimpleImporterProcessor.isEmptyField(fields, 11, false)
                        && !AbstractSimpleImporterProcessor.isEmptyField(fields, 12, false)
                        && AbstractSimpleImporterProcessor.isEmptyField(fields, 13, false)) {
                    fields[13] = fields[0];// asign adm4code with featureid
                }
                return fields;

            default:
                return fields;
            }

        }
        return fields;
    }

    /**
     * @param regexp
     *            a regexp
     * @return A {@link Pattern} or null if the regexp are not corrects
     */
    public static Pattern compileRegex(String regexp) {
        try {
            if (regexp != null && !regexp.trim().equals("")) {
                return Pattern.compile(regexp, Pattern.CASE_INSENSITIVE);
            } else {
                return null;
            }
        } catch (RuntimeException e) {
            return null;
        }
    }

    /**
     * @param secsIn
     *            the number of seconds
     * @return a human reading strings. example :1 hour 6 minuts 40 seconds.
     */
    public static String formatSeconds(long secsIn) {

        long hours = secsIn / 3600,

                remainder = secsIn % 3600, minutes = remainder / 60, seconds = remainder % 60;
        String displayhours = hours == 0 ? "" : hours + " hour" + getPlural(hours);
        String displayMin = minutes == 0 ? "" : minutes + " minut" + getPlural(minutes);
        String displaySec = seconds == 0 ? "" : seconds + " second" + getPlural(seconds);
        return displayhours + displayMin + displaySec;
    }

    private static String getPlural(long count) {
        return count > 1 ? "s " : " ";
    }

}