com.ehsy.solr.util.SimplePostTool.java Source code

Introduction

Here is the source code for com.ehsy.solr.util.SimplePostTool.java
Source

package com.ehsy.solr.util;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import javax.xml.bind.DatatypeConverter;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import org.apache.commons.io.FileUtils;

/**
 * A simple utility class for posting raw updates to a Solr server, 
 * has a main method so it can be run on the command line.
 * View this not as a best-practice code example, but as a standalone 
 * example built with an explicit purpose of not having external
 * jar dependencies.
 */
public class SimplePostTool {
    private static final String DEFAULT_POST_URL = "http://localhost:9080/solr/ehsy/update";
    private static final String VERSION_OF_THIS_TOOL = "1.5";

    private static final String DEFAULT_COMMIT = "yes";
    private static final String DEFAULT_OPTIMIZE = "no";
    private static final String DEFAULT_OUT = "no";
    private static final String DEFAULT_AUTO = "no";
    private static final String DEFAULT_RECURSIVE = "0";
    private static final int DEFAULT_WEB_DELAY = 10;
    private static final int MAX_WEB_DEPTH = 10;
    private static final String DEFAULT_CONTENT_TYPE = "application/xml";
    private static final String DEFAULT_FILE_TYPES = "xml,json,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log";

    static final String DATA_MODE_FILES = "files";
    static final String DATA_MODE_ARGS = "args";
    static final String DATA_MODE_STDIN = "stdin";
    static final String DATA_MODE_WEB = "web";
    static final String DEFAULT_DATA_MODE = DATA_MODE_FILES;

    // Input args
    boolean auto = false;
    int recursive = 0;
    int delay = 0;
    String fileTypes;
    URL solrUrl;
    OutputStream out = null;
    String type;
    String mode;
    boolean commit;
    boolean optimize;
    String[] args;

    private int currentDepth;

    static HashMap<String, String> mimeMap;
    GlobFileFilter globFileFilter;
    // Backlog for crawling
    List<LinkedHashSet<URL>> backlog = new ArrayList<>();
    Set<URL> visited = new HashSet<>();

    static final Set<String> DATA_MODES = new HashSet<>();
    static final String USAGE_STRING_SHORT = "Usage: java [SystemProperties] -jar post.jar [-h|-] [<file|folder|url|arg> [<file|folder|url|arg>...]]";

    // Used in tests to avoid doing actual network traffic
    static boolean mockMode = false;
    static PageFetcher pageFetcher;

    static {
        DATA_MODES.add(DATA_MODE_FILES);
        DATA_MODES.add(DATA_MODE_ARGS);
        DATA_MODES.add(DATA_MODE_STDIN);
        DATA_MODES.add(DATA_MODE_WEB);

        mimeMap = new HashMap<>();
        mimeMap.put("xml", "text/xml");
        mimeMap.put("csv", "text/csv");
        mimeMap.put("json", "application/json");
        mimeMap.put("pdf", "application/pdf");
        mimeMap.put("rtf", "text/rtf");
        mimeMap.put("html", "text/html");
        mimeMap.put("htm", "text/html");
        mimeMap.put("doc", "application/msword");
        mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        mimeMap.put("ppt", "application/vnd.ms-powerpoint");
        mimeMap.put("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
        mimeMap.put("xls", "application/vnd.ms-excel");
        mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
        mimeMap.put("odt", "application/vnd.oasis.opendocument.text");
        mimeMap.put("ott", "application/vnd.oasis.opendocument.text");
        mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation");
        mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation");
        mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet");
        mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet");
        mimeMap.put("txt", "text/plain");
        mimeMap.put("log", "text/plain");
    }

    /**
     * See usage() for valid command line usage
     * @param args the params on the command line
     */
    public static void main(String[] args) {
        info("SimplePostTool version " + VERSION_OF_THIS_TOOL);
        if (0 < args.length && ("-help".equals(args[0]) || "--help".equals(args[0]) || "-h".equals(args[0]))) {
            usage();
        } else {
            final SimplePostTool t = parseArgsAndInit(args);
            t.execute();
        }
    }

    /**
     * After initialization, call execute to start the post job.
     * This method delegates to the correct mode method.
     */
    public void execute() {
        final long startTime = System.currentTimeMillis();
        if (DATA_MODE_FILES.equals(mode) && args.length > 0) {
            doFilesMode();
        } else if (DATA_MODE_ARGS.equals(mode) && args.length > 0) {
            doArgsMode();
        } else if (DATA_MODE_WEB.equals(mode) && args.length > 0) {
            doWebMode();
        } else if (DATA_MODE_STDIN.equals(mode)) {
            doStdinMode();
        } else {
            usageShort();
            return;
        }

        if (commit)
            commit();
        if (optimize)
            optimize();
        final long endTime = System.currentTimeMillis();
        displayTiming(endTime - startTime);
    }

    /**
     * Pretty prints the number of milliseconds taken to post the content to Solr
     * @param millis the time in milliseconds
     */
    private void displayTiming(long millis) {
        SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault());
        df.setTimeZone(TimeZone.getTimeZone("UTC"));
        System.out.println("Time spent: " + df.format(new Date(millis)));
    }

    /**
     * Parses incoming arguments and system params and initializes the tool
     * @param args the incoming cmd line args
     * @return an instance of SimplePostTool
     */
    public static SimplePostTool parseArgsAndInit(String[] args) {
        String urlStr = null;
        try {
            // Parse args
            final String mode = System.getProperty("data", DEFAULT_DATA_MODE);
            if (!DATA_MODES.contains(mode)) {
                fatal("System Property 'data' is not valid for this tool: " + mode);
            }
            String params = System.getProperty("params", "");
            urlStr = System.getProperty("url", DEFAULT_POST_URL);
            urlStr = SimplePostTool.appendParam(urlStr, params);
            URL url = new URL(urlStr);
            boolean auto = isOn(System.getProperty("auto", DEFAULT_AUTO));
            String type = System.getProperty("type");
            // Recursive
            int recursive = 0;
            String r = System.getProperty("recursive", DEFAULT_RECURSIVE);
            try {
                recursive = Integer.parseInt(r);
            } catch (Exception e) {
                if (isOn(r))
                    recursive = DATA_MODE_WEB.equals(mode) ? 1 : 999;
            }
            // Delay
            int delay = DATA_MODE_WEB.equals(mode) ? DEFAULT_WEB_DELAY : 0;
            try {
                delay = Integer.parseInt(System.getProperty("delay", "" + delay));
            } catch (Exception e) {
            }
            OutputStream out = isOn(System.getProperty("out", DEFAULT_OUT)) ? System.out : null;
            String fileTypes = System.getProperty("filetypes", DEFAULT_FILE_TYPES);
            boolean commit = isOn(System.getProperty("commit", DEFAULT_COMMIT));
            boolean optimize = isOn(System.getProperty("optimize", DEFAULT_OPTIMIZE));

            return new SimplePostTool(mode, url, auto, type, recursive, delay, fileTypes, out, commit, optimize,
                    args);
        } catch (MalformedURLException e) {
            fatal("System Property 'url' is not a valid URL: " + urlStr);
            return null;
        }
    }

    /**
     * Constructor which takes in all mandatory input for the tool to work.
     * Also see usage() for further explanation of the params.
     * @param mode whether to post files, web pages, params or stdin
     * @param url the Solr base Url to post to, should end with /update
     * @param auto if true, we'll guess type and add resourcename/url
     * @param type content-type of the data you are posting
     * @param recursive number of levels for file/web mode, or 0 if one file only
     * @param delay if recursive then delay will be the wait time between posts
     * @param fileTypes a comma separated list of file-name endings to accept for file/web
     * @param out an OutputStream to write output to, e.g. stdout to print to console
     * @param commit if true, will commit at end of posting
     * @param optimize if true, will optimize at end of posting
     * @param args a String[] of arguments, varies between modes
     */
    public SimplePostTool(String mode, URL url, boolean auto, String type, int recursive, int delay,
            String fileTypes, OutputStream out, boolean commit, boolean optimize, String[] args) {
        this.mode = mode;
        this.solrUrl = url;
        this.auto = auto;
        this.type = type;
        this.recursive = recursive;
        this.delay = delay;
        this.fileTypes = fileTypes;
        this.globFileFilter = getFileFilterFromFileTypes(fileTypes);
        this.out = out;
        this.commit = commit;
        this.optimize = optimize;
        this.args = args;
        pageFetcher = new PageFetcher();
    }

    public SimplePostTool() {
    }

    //
    // Do some action depending on which mode we have
    //
    private void doFilesMode() {
        currentDepth = 0;
        // Skip posting files if special param "-" given  
        if (!args[0].equals("-")) {
            info("Posting files to base url " + solrUrl
                    + (!auto ? " using content-type " + (type == null ? DEFAULT_CONTENT_TYPE : type) : "") + "..");
            if (auto)
                info("Entering auto mode. File endings considered are " + fileTypes);
            if (recursive > 0)
                info("Entering recursive mode, max depth=" + recursive + ", delay=" + delay + "s");
            int numFilesPosted = postFiles(args, 0, out, type);
            info(numFilesPosted + " files indexed.");
        }
    }

    private void doArgsMode() {
        info("POSTing args to " + solrUrl + "..");
        for (String a : args) {
            postData(stringToStream(a), null, out, type, solrUrl);
        }
    }

    private int doWebMode() {
        reset();
        int numPagesPosted = 0;
        try {
            if (type != null) {
                fatal("Specifying content-type with \"-Ddata=web\" is not supported");
            }
            if (args[0].equals("-")) {
                // Skip posting url if special param "-" given  
                return 0;
            }
            // Set Extracting handler as default
            solrUrl = appendUrlPath(solrUrl, "/extract");

            info("Posting web pages to Solr url " + solrUrl);
            auto = true;
            info("Entering auto mode. Indexing pages with content-types corresponding to file endings "
                    + fileTypes);
            if (recursive > 0) {
                if (recursive > MAX_WEB_DEPTH) {
                    recursive = MAX_WEB_DEPTH;
                    warn("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "...");
                }
                if (delay < DEFAULT_WEB_DELAY)
                    warn("Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked");
                info("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s");
            }
            numPagesPosted = postWebPages(args, 0, out);
            info(numPagesPosted + " web pages indexed.");
        } catch (MalformedURLException e) {
            fatal("Wrong URL trying to append /extract to " + solrUrl);
        }
        return numPagesPosted;
    }

    private void doStdinMode() {
        info("POSTing stdin to " + solrUrl + "..");
        postData(System.in, null, out, type, solrUrl);
    }

    private void reset() {
        fileTypes = DEFAULT_FILE_TYPES;
        globFileFilter = this.getFileFilterFromFileTypes(fileTypes);
        backlog = new ArrayList<>();
        visited = new HashSet<>();
    }

    //
    // USAGE
    //
    private static void usageShort() {
        System.out.println(
                USAGE_STRING_SHORT + "\n" + "       Please invoke with -h option for extended usage help.");
    }

    private static void usage() {
        System.out.println(USAGE_STRING_SHORT + "\n\n" + "Supported System Properties and their defaults:\n"
                + "  -Ddata=files|web|args|stdin (default=" + DEFAULT_DATA_MODE + ")\n"
                + "  -Dtype=<content-type> (default=" + DEFAULT_CONTENT_TYPE + ")\n"
                + "  -Durl=<solr-update-url> (default=" + DEFAULT_POST_URL + ")\n" + "  -Dauto=yes|no (default="
                + DEFAULT_AUTO + ")\n" + "  -Drecursive=yes|no|<depth> (default=" + DEFAULT_RECURSIVE + ")\n"
                + "  -Ddelay=<seconds> (default=0 for files, 10 for web)\n"
                + "  -Dfiletypes=<type>[,<type>,...] (default=" + DEFAULT_FILE_TYPES + ")\n"
                + "  -Dparams=\"<key>=<value>[&<key>=<value>...]\" (values must be URL-encoded)\n"
                + "  -Dcommit=yes|no (default=" + DEFAULT_COMMIT + ")\n" + "  -Doptimize=yes|no (default="
                + DEFAULT_OPTIMIZE + ")\n" + "  -Dout=yes|no (default=" + DEFAULT_OUT + ")\n\n"
                + "This is a simple command line tool for POSTing raw data to a Solr\n"
                + "port.  Data can be read from files specified as commandline args,\n"
                + "URLs specified as args, as raw commandline arg strings or via STDIN.\n" + "Examples:\n"
                + "  java -jar post.jar *.xml\n"
                + "  java -Ddata=args  -jar post.jar '<delete><id>42</id></delete>'\n"
                + "  java -Ddata=stdin -jar post.jar < hd.xml\n"
                + "  java -Ddata=web -jar post.jar http://example.com/\n"
                + "  java -Dtype=text/csv -jar post.jar *.csv\n"
                + "  java -Dtype=application/json -jar post.jar *.json\n"
                + "  java -Durl=http://localhost:8983/solr/update/extract -Dparams=literal.id=a -Dtype=application/pdf -jar post.jar a.pdf\n"
                + "  java -Dauto -jar post.jar *\n" + "  java -Dauto -Drecursive -jar post.jar afolder\n"
                + "  java -Dauto -Dfiletypes=ppt,html -jar post.jar afolder\n"
                + "The options controlled by System Properties include the Solr\n"
                + "URL to POST to, the Content-Type of the data, whether a commit\n"
                + "or optimize should be executed, and whether the response should\n"
                + "be written to STDOUT. If auto=yes the tool will try to set type\n"
                + "and url automatically from file name. When posting rich documents\n"
                + "the file name will be propagated as \"resource.name\" and also used\n"
                + "as \"literal.id\". You may override these or any other request parameter\n"
                + "through the -Dparams property. To do a commit only, use \"-\" as argument.\n"
                + "The web mode is a simple crawler following links within domain, default delay=10s.");
    }

    /** Post all filenames provided in args
     * @param args array of file names
     * @param startIndexInArgs offset to start
     * @param out output stream to post data to
     * @param type default content-type to use when posting (may be overridden in auto mode)
     * @return number of files posted
     * */
    public int postFiles(String[] args, int startIndexInArgs, OutputStream out, String type) {
        reset();
        int filesPosted = 0;
        for (int j = startIndexInArgs; j < args.length; j++) {
            File srcFile = new File(args[j]);
            if (srcFile.isDirectory() && srcFile.canRead()) {
                filesPosted += postDirectory(srcFile, out, type);
            } else if (srcFile.isFile() && srcFile.canRead()) {
                filesPosted += postFiles(new File[] { srcFile }, out, type);
            } else {
                File parent = srcFile.getParentFile();
                if (parent == null)
                    parent = new File(".");
                String fileGlob = srcFile.getName();
                GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
                File[] files = parent.listFiles(ff);
                if (files == null || files.length == 0) {
                    warn("No files or directories matching " + srcFile);
                    continue;
                }
                filesPosted += postFiles(parent.listFiles(ff), out, type);
            }
        }
        return filesPosted;
    }

    /** Post all filenames provided in args
     * @param files array of Files
     * @param startIndexInArgs offset to start
     * @param out output stream to post data to
     * @param type default content-type to use when posting (may be overridden in auto mode)
     * @return number of files posted
     * */
    public int postFiles(File[] files, int startIndexInArgs, OutputStream out, String type) {
        reset();
        int filesPosted = 0;
        for (File srcFile : files) {
            if (srcFile.isDirectory() && srcFile.canRead()) {
                filesPosted += postDirectory(srcFile, out, type);
            } else if (srcFile.isFile() && srcFile.canRead()) {
                filesPosted += postFiles(new File[] { srcFile }, out, type);
            } else {
                File parent = srcFile.getParentFile();
                if (parent == null)
                    parent = new File(".");
                String fileGlob = srcFile.getName();
                GlobFileFilter ff = new GlobFileFilter(fileGlob, false);
                File[] fileList = parent.listFiles(ff);
                if (fileList == null || fileList.length == 0) {
                    warn("No files or directories matching " + srcFile);
                    continue;
                }
                filesPosted += postFiles(fileList, out, type);
            }
        }
        return filesPosted;
    }

    /**
     * Posts a whole directory
     * @return number of files posted total
     */
    private int postDirectory(File dir, OutputStream out, String type) {
        if (dir.isHidden() && !dir.getName().equals("."))
            return (0);
        info("Indexing directory " + dir.getPath() + " (" + dir.listFiles(globFileFilter).length + " files, depth="
                + currentDepth + ")");
        int posted = 0;
        posted += postFiles(dir.listFiles(globFileFilter), out, type);
        if (recursive > currentDepth) {
            for (File d : dir.listFiles()) {
                if (d.isDirectory()) {
                    currentDepth++;
                    posted += postDirectory(d, out, type);
                    currentDepth--;
                }
            }
        }
        return posted;
    }

    /**
     * Posts a list of file names
     * @return number of files posted
     */
    int postFiles(File[] files, OutputStream out, String type) {
        int filesPosted = 0;
        for (File srcFile : files) {
            try {
                if (!srcFile.isFile() || srcFile.isHidden())
                    continue;
                postFile(srcFile, out, type);
                Thread.sleep(delay * 1000);
                filesPosted++;
            } catch (InterruptedException e) {
                throw new RuntimeException();
            }
        }
        return filesPosted;
    }

    /**
     * This method takes as input a list of start URL strings for crawling,
     * adds each one to a backlog and then starts crawling
     * @param args the raw input args from main()
     * @param startIndexInArgs offset for where to start
     * @param out outputStream to write results to
     * @return the number of web pages posted
     */
    public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) {
        reset();
        LinkedHashSet<URL> s = new LinkedHashSet<>();
        for (int j = startIndexInArgs; j < args.length; j++) {
            try {
                URL u = new URL(normalizeUrlEnding(args[j]));
                s.add(u);
            } catch (MalformedURLException e) {
                warn("Skipping malformed input URL: " + args[j]);
            }
        }
        // Add URLs to level 0 of the backlog and start recursive crawling
        backlog.add(s);
        return webCrawl(0, out);
    }

    /**
     * Normalizes a URL string by removing anchor part and trailing slash
     * @return the normalized URL string
     */
    protected static String normalizeUrlEnding(String link) {
        if (link.indexOf("#") > -1)
            link = link.substring(0, link.indexOf("#"));
        if (link.endsWith("?"))
            link = link.substring(0, link.length() - 1);
        if (link.endsWith("/"))
            link = link.substring(0, link.length() - 1);
        return link;
    }

    /**
     * A very simple crawler, pulling URLs to fetch from a backlog and then
     * recurses N levels deep if recursive>0. Links are parsed from HTML
     * through first getting an XHTML version using SolrCell with extractOnly,
     * and followed if they are local. The crawler pauses for a default delay
     * of 10 seconds between each fetch, this can be configured in the delay
     * variable. This is only meant for test purposes, as it does not respect
     * robots or anything else fancy :)
     * @param level which level to crawl
     * @param out output stream to write to
     * @return number of pages crawled on this level and below
     */
    protected int webCrawl(int level, OutputStream out) {
        int numPages = 0;
        LinkedHashSet<URL> stack = backlog.get(level);
        int rawStackSize = stack.size();
        stack.removeAll(visited);
        int stackSize = stack.size();
        LinkedHashSet<URL> subStack = new LinkedHashSet<>();
        info("Entering crawl at level " + level + " (" + rawStackSize + " links total, " + stackSize + " new)");
        for (URL u : stack) {
            try {
                visited.add(u);
                PageFetcherResult result = pageFetcher.readPageFromUrl(u);
                if (result.httpStatus == 200) {
                    u = (result.redirectUrl != null) ? result.redirectUrl : u;
                    URL postUrl = new URL(
                            appendParam(solrUrl.toString(), "literal.id=" + URLEncoder.encode(u.toString(), "UTF-8")
                                    + "&literal.url=" + URLEncoder.encode(u.toString(), "UTF-8")));
                    boolean success = postData(new ByteArrayInputStream(result.content), null, out,
                            result.contentType, postUrl);
                    if (success) {
                        info("POSTed web resource " + u + " (depth: " + level + ")");
                        Thread.sleep(delay * 1000);
                        numPages++;
                        // Pull links from HTML pages only
                        if (recursive > level && result.contentType.equals("text/html")) {
                            Set<URL> children = pageFetcher.getLinksFromWebPage(u,
                                    new ByteArrayInputStream(result.content), result.contentType, postUrl);
                            subStack.addAll(children);
                        }
                    } else {
                        warn("An error occurred while posting " + u);
                    }
                } else {
                    warn("The URL " + u + " returned a HTTP result status of " + result.httpStatus);
                }
            } catch (IOException e) {
                warn("Caught exception when trying to open connection to " + u + ": " + e.getMessage());
            } catch (InterruptedException e) {
                throw new RuntimeException();
            }
        }
        if (!subStack.isEmpty()) {
            backlog.add(subStack);
            numPages += webCrawl(level + 1, out);
        }
        return numPages;
    }

    /**
     * Reads an input stream into a byte array
     * @param is the input stream
     * @return the byte array
     * @throws IOException If there is a low-level I/O error.
     */
    protected byte[] inputStreamToByteArray(InputStream is) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        int next = is.read();
        while (next > -1) {
            bos.write(next);
            next = is.read();
        }
        bos.flush();
        is.close();
        return bos.toByteArray();
    }

    /**
     * Computes the full URL based on a base url and a possibly relative link found
     * in the href param of an HTML anchor.
     * @param baseUrl the base url from where the link was found
     * @param link the absolute or relative link
     * @return the string version of the full URL
     */
    protected String computeFullUrl(URL baseUrl, String link) {
        if (link == null || link.length() == 0) {
            return null;
        }
        if (!link.startsWith("http")) {
            if (link.startsWith("/")) {
                link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link;
            } else {
                if (link.contains(":")) {
                    return null; // Skip non-relative URLs
                }
                String path = baseUrl.getPath();
                if (!path.endsWith("/")) {
                    int sep = path.lastIndexOf("/");
                    String file = path.substring(sep + 1);
                    if (file.contains(".") || file.contains("?"))
                        path = path.substring(0, sep);
                }
                link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link;
            }
        }
        link = normalizeUrlEnding(link);
        String l = link.toLowerCase(Locale.ROOT);
        // Simple brute force skip images
        if (l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) {
            return null; // Skip images
        }
        return link;
    }

    /**
     * Uses the mime-type map to reverse lookup whether the file ending for our type
     * is supported by the fileTypes option
     * @param type what content-type to lookup
     * @return true if this is a supported content type
     */
    protected boolean typeSupported(String type) {
        for (String key : mimeMap.keySet()) {
            if (mimeMap.get(key).equals(type)) {
                if (fileTypes.contains(key))
                    return true;
            }
        }
        return false;
    }

    /**
     * Tests if a string is either "true", "on", "yes" or "1"
     * @param property the string to test
     * @return true if "on"
     */
    protected static boolean isOn(String property) {
        return ("true,on,yes,1".indexOf(property) > -1);
    }

    static void warn(String msg) {
        System.err.println("SimplePostTool: WARNING: " + msg);
    }

    static void info(String msg) {
        System.out.println(msg);
    }

    static void fatal(String msg) {
        System.err.println("SimplePostTool: FATAL: " + msg);
        System.exit(2);
    }

    /**
     * Does a simple commit operation 
     */
    public void commit() {
        info("COMMITting Solr index changes to " + solrUrl + "..");
        doGet(appendParam(solrUrl.toString(), "commit=true"));
    }

    /**
     * Does a simple optimize operation 
     */
    public void optimize() {
        info("Performing an OPTIMIZE to " + solrUrl + "..");
        doGet(appendParam(solrUrl.toString(), "optimize=true"));
    }

    /**
     * Appends a URL query parameter to a URL 
     * @param url the original URL
     * @param param the parameter(s) to append, separated by "&"
     * @return the string version of the resulting URL
     */
    public static String appendParam(String url, String param) {
        String[] pa = param.split("&");
        for (String p : pa) {
            if (p.trim().length() == 0)
                continue;
            String[] kv = p.split("=");
            if (kv.length == 2) {
                url = url + (url.indexOf('?') > 0 ? "&" : "?") + kv[0] + "=" + kv[1];
            } else {
                warn("Skipping param " + p + " which is not on form key=value");
            }
        }
        return url;
    }

    /**
     * Opens the file and posts it's contents to the solrUrl,
     * writes to response to output. 
     */
    public void postFile(File file, OutputStream output, String type) {
        InputStream is = null;
        try {
            URL url = solrUrl;
            if (auto) {
                if (type == null) {
                    type = guessType(file);
                }
                if (type != null) {
                    if (type.equals("text/xml") || type.equals("text/csv") || type.equals("application/json")) {
                        // Default handler
                    } else {
                        // SolrCell
                        String urlStr = appendUrlPath(solrUrl, "/extract").toString();
                        if (urlStr.indexOf("resource.name") == -1)
                            urlStr = appendParam(urlStr,
                                    "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
                        if (urlStr.indexOf("literal.id") == -1)
                            urlStr = appendParam(urlStr,
                                    "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), "UTF-8"));
                        url = new URL(urlStr);
                    }
                } else {
                    warn("Skipping " + file.getName() + ". Unsupported file type for auto mode.");
                    return;
                }
            } else {
                if (type == null)
                    type = DEFAULT_CONTENT_TYPE;
            }
            info("POSTing file " + file.getName() + (auto ? " (" + type + ")" : ""));
            is = new FileInputStream(file);
            postData(is, (int) file.length(), output, type, url);
        } catch (IOException e) {
            e.printStackTrace();
            warn("Can't open/read file: " + file);
        } finally {
            try {
                if (is != null)
                    is.close();
                //move update file to ../backup added by niko 2014-10-16
                if (file.exists() && file.canWrite()) {
                    FileUtils.copyFileToDirectory(file,
                            new File(file.getParentFile().getParent() + File.separator + "backup"), false);
                    boolean del = file.delete();
                    info("delete file " + file.getName() + "\t" + del);
                }
            } catch (IOException e) {
                fatal("IOException while closing file: " + e);
            }
        }
    }

    /**
     * Appends to the path of the URL
     * @param url the URL
     * @param append the path to append
     * @return the final URL version 
     */
    protected static URL appendUrlPath(URL url, String append) throws MalformedURLException {
        return new URL(url.getProtocol() + "://" + url.getAuthority() + url.getPath() + append
                + (url.getQuery() != null ? "?" + url.getQuery() : ""));
    }

    /**
     * Guesses the type of a file, based on file name suffix
     * @param file the file
     * @return the content-type guessed
     */
    protected static String guessType(File file) {
        String name = file.getName();
        String suffix = name.substring(name.lastIndexOf(".") + 1);
        return mimeMap.get(suffix.toLowerCase(Locale.ROOT));
    }

    /**
     * Performs a simple get on the given URL
     */
    public static void doGet(String url) {
        try {
            doGet(new URL(url));
        } catch (MalformedURLException e) {
            warn("The specified URL " + url + " is not a valid URL. Please check");
        }
    }

    /**
     * Performs a simple get on the given URL
     */
    public static void doGet(URL url) {
        try {
            if (mockMode)
                return;
            HttpURLConnection urlc = (HttpURLConnection) url.openConnection();
            if (url.getUserInfo() != null) {
                String encoding = DatatypeConverter
                        .printBase64Binary(url.getUserInfo().getBytes(StandardCharsets.US_ASCII));
                urlc.setRequestProperty("Authorization", "Basic " + encoding);
            }
            urlc.connect();
            checkResponseCode(urlc);
        } catch (IOException e) {
            warn("An error occurred posting data to " + url + ". Please check that Solr is running.");
        }
    }

    /**
     * Reads data from the data stream and posts it to solr,
     * writes to the response to output
     * @return true if success
     */
    public boolean postData(InputStream data, Integer length, OutputStream output, String type, URL url) {
        if (mockMode)
            return true;
        boolean success = true;
        if (type == null)
            type = DEFAULT_CONTENT_TYPE;
        HttpURLConnection urlc = null;
        try {
            try {
                urlc = (HttpURLConnection) url.openConnection();
                try {
                    urlc.setRequestMethod("POST");
                } catch (ProtocolException e) {
                    fatal("Shouldn't happen: HttpURLConnection doesn't support POST??" + e);
                }
                urlc.setDoOutput(true);
                urlc.setDoInput(true);
                urlc.setUseCaches(false);
                urlc.setAllowUserInteraction(false);
                urlc.setRequestProperty("Content-type", type);
                if (url.getUserInfo() != null) {
                    String encoding = DatatypeConverter
                            .printBase64Binary(url.getUserInfo().getBytes(StandardCharsets.US_ASCII));
                    urlc.setRequestProperty("Authorization", "Basic " + encoding);
                }
                if (null != length)
                    urlc.setFixedLengthStreamingMode(length);
                urlc.connect();
            } catch (IOException e) {
                fatal("Connection error (is Solr running at " + solrUrl + " ?): " + e);
                success = false;
            }

            try (final OutputStream out = urlc.getOutputStream()) {
                pipe(data, out);
            } catch (IOException e) {
                fatal("IOException while posting data: " + e);
                success = false;
            }

            try {
                success &= checkResponseCode(urlc);
                try (final InputStream in = urlc.getInputStream()) {
                    pipe(in, output);
                }
            } catch (IOException e) {
                warn("IOException while reading response: " + e);
                success = false;
            }
        } finally {
            if (urlc != null)
                urlc.disconnect();
        }
        return success;
    }

    private static boolean checkResponseCode(HttpURLConnection urlc) throws IOException {
        if (urlc.getResponseCode() >= 400) {
            warn("Solr returned an error #" + urlc.getResponseCode() + " (" + urlc.getResponseMessage()
                    + ") for url: " + urlc.getURL());
            Charset charset = StandardCharsets.ISO_8859_1;
            final String contentType = urlc.getContentType();
            // code cloned from ContentStreamBase, but post.jar should be standalone!
            if (contentType != null) {
                int idx = contentType.toLowerCase(Locale.ROOT).indexOf("charset=");
                if (idx > 0) {
                    charset = Charset.forName(contentType.substring(idx + "charset=".length()).trim());
                }
            }
            // Print the response returned by Solr
            try (InputStream errStream = urlc.getErrorStream()) {
                if (errStream != null) {
                    BufferedReader br = new BufferedReader(new InputStreamReader(errStream, charset));
                    final StringBuilder response = new StringBuilder("Response: ");
                    int ch;
                    while ((ch = br.read()) != -1) {
                        response.append((char) ch);
                    }
                    warn(response.toString().trim());
                }
            }
            return false;
        }
        return true;
    }

    /**
     * Converts a string to an input stream 
     * @param s the string
     * @return the input stream
     */
    public static InputStream stringToStream(String s) {
        return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Pipes everything from the source to the dest.  If dest is null, 
     * then everything is read from source and thrown away.
     */
    private static void pipe(InputStream source, OutputStream dest) throws IOException {
        byte[] buf = new byte[1024];
        int read = 0;
        while ((read = source.read(buf)) >= 0) {
            if (null != dest)
                dest.write(buf, 0, read);
        }
        if (null != dest)
            dest.flush();
    }

    public GlobFileFilter getFileFilterFromFileTypes(String fileTypes) {
        String glob;
        if (fileTypes.equals("*"))
            glob = ".*";
        else
            glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$";
        return new GlobFileFilter(glob, true);
    }

    //
    // Utility methods for XPath handing
    //

    /**
     * Gets all nodes matching an XPath
     */
    public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException {
        XPathFactory factory = XPathFactory.newInstance();
        XPath xp = factory.newXPath();
        XPathExpression expr = xp.compile(xpath);
        return (NodeList) expr.evaluate(n, XPathConstants.NODESET);
    }

    /**
     * Gets the string content of the matching an XPath
     * @param n the node (or doc)
     * @param xpath the xpath string
     * @param concatAll if true, text from all matching nodes will be concatenated, else only the first returned
     */
    public static String getXP(Node n, String xpath, boolean concatAll) throws XPathExpressionException {
        NodeList nodes = getNodesFromXP(n, xpath);
        StringBuilder sb = new StringBuilder();
        if (nodes.getLength() > 0) {
            for (int i = 0; i < nodes.getLength(); i++) {
                sb.append(nodes.item(i).getNodeValue() + " ");
                if (!concatAll)
                    break;
            }
            return sb.toString().trim();
        } else
            return "";
    }

    /**
     * Takes a string as input and returns a DOM 
     */
    public static Document makeDom(byte[] in) throws SAXException, IOException, ParserConfigurationException {
        InputStream is = new ByteArrayInputStream(in);
        Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(is);
        return dom;
    }

    /**
     * Inner class to filter files based on glob wildcards
     */
    class GlobFileFilter implements FileFilter {
        private String _pattern;
        private Pattern p;

        public GlobFileFilter(String pattern, boolean isRegex) {
            _pattern = pattern;
            if (!isRegex) {
                _pattern = _pattern.replace("^", "\\^").replace("$", "\\$").replace(".", "\\.").replace("(", "\\(")
                        .replace(")", "\\)").replace("+", "\\+").replace("*", ".*").replace("?", ".");
                _pattern = "^" + _pattern + "$";
            }

            try {
                p = Pattern.compile(_pattern, Pattern.CASE_INSENSITIVE);
            } catch (PatternSyntaxException e) {
                fatal("Invalid type list " + pattern + ". " + e.getDescription());
            }
        }

        @Override
        public boolean accept(File file) {
            return p.matcher(file.getName()).find();
        }
    }

    //
    // Simple crawler class which can fetch a page and check for robots.txt
    //
    class PageFetcher {
        Map<String, List<String>> robotsCache;
        final String DISALLOW = "Disallow:";

        public PageFetcher() {
            robotsCache = new HashMap<>();
        }

        public PageFetcherResult readPageFromUrl(URL u) {
            PageFetcherResult res = new PageFetcherResult();
            try {
                if (isDisallowedByRobots(u)) {
                    warn("The URL " + u + " is disallowed by robots.txt and will not be crawled.");
                    res.httpStatus = 403;
                    visited.add(u);
                    return res;
                }
                res.httpStatus = 404;
                HttpURLConnection conn = (HttpURLConnection) u.openConnection();
                conn.setRequestProperty("User-Agent",
                        "SimplePostTool-crawler/" + VERSION_OF_THIS_TOOL + " (http://lucene.apache.org/solr/)");
                conn.setRequestProperty("Accept-Encoding", "gzip, deflate");
                conn.connect();
                res.httpStatus = conn.getResponseCode();
                if (!normalizeUrlEnding(conn.getURL().toString()).equals(normalizeUrlEnding(u.toString()))) {
                    info("The URL " + u + " caused a redirect to " + conn.getURL());
                    u = conn.getURL();
                    res.redirectUrl = u;
                    visited.add(u);
                }
                if (res.httpStatus == 200) {
                    // Raw content type of form "text/html; encoding=utf-8"
                    String rawContentType = conn.getContentType();
                    String type = rawContentType.split(";")[0];
                    if (typeSupported(type)) {
                        String encoding = conn.getContentEncoding();
                        InputStream is;
                        if (encoding != null && encoding.equalsIgnoreCase("gzip")) {
                            is = new GZIPInputStream(conn.getInputStream());
                        } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) {
                            is = new InflaterInputStream(conn.getInputStream(), new Inflater(true));
                        } else {
                            is = conn.getInputStream();
                        }

                        // Read into memory, so that we later can pull links from the page without re-fetching 
                        res.content = inputStreamToByteArray(is);
                        is.close();
                    } else {
                        warn("Skipping URL with unsupported type " + type);
                        res.httpStatus = 415;
                    }
                }
            } catch (IOException e) {
                warn("IOException when reading page from url " + u + ": " + e.getMessage());
            }
            return res;
        }

        public boolean isDisallowedByRobots(URL url) {
            String host = url.getHost();
            String strRobot = url.getProtocol() + "://" + host + "/robots.txt";
            List<String> disallows = robotsCache.get(host);
            if (disallows == null) {
                disallows = new ArrayList<>();
                URL urlRobot;
                try {
                    urlRobot = new URL(strRobot);
                    disallows = parseRobotsTxt(urlRobot.openStream());
                } catch (MalformedURLException e) {
                    return true; // We cannot trust this robots URL, should not happen
                } catch (IOException e) {
                    // There is no robots.txt, will cache an empty disallow list
                }
            }

            robotsCache.put(host, disallows);

            String strURL = url.getFile();
            for (String path : disallows) {
                if (path.equals("/") || strURL.indexOf(path) == 0)
                    return true;
            }
            return false;
        }

        /**
         * Very simple robots.txt parser which obeys all Disallow lines regardless
         * of user agent or whether there are valid Allow: lines.
         * @param is Input stream of the robots.txt file
         * @return a list of disallow paths
         * @throws IOException if problems reading the stream
         */
        protected List<String> parseRobotsTxt(InputStream is) throws IOException {
            List<String> disallows = new ArrayList<>();
            BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
            String l;
            while ((l = r.readLine()) != null) {
                String[] arr = l.split("#");
                if (arr.length == 0)
                    continue;
                l = arr[0].trim();
                if (l.startsWith(DISALLOW)) {
                    l = l.substring(DISALLOW.length()).trim();
                    if (l.length() == 0)
                        continue;
                    disallows.add(l);
                }
            }
            is.close();
            return disallows;
        }

        /**
         * Finds links on a web page, using /extract?extractOnly=true
         * @param u the URL of the web page
         * @param is the input stream of the page
         * @param type the content-type
         * @param postUrl the URL (typically /solr/extract) in order to pull out links
         * @return a set of URLs parsed from the page
         */
        protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) {
            Set<URL> l = new HashSet<>();
            URL url = null;
            try {
                ByteArrayOutputStream os = new ByteArrayOutputStream();
                URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true"));
                boolean success = postData(is, null, os, type, extractUrl);
                if (success) {
                    Document d = makeDom(os.toByteArray());
                    String innerXml = getXP(d, "/response/str/text()[1]", false);
                    d = makeDom(innerXml.getBytes(StandardCharsets.UTF_8));
                    NodeList links = getNodesFromXP(d, "/html/body//a/@href");
                    for (int i = 0; i < links.getLength(); i++) {
                        String link = links.item(i).getTextContent();
                        link = computeFullUrl(u, link);
                        if (link == null)
                            continue;
                        url = new URL(link);
                        if (url.getAuthority() == null || !url.getAuthority().equals(u.getAuthority()))
                            continue;
                        l.add(url);
                    }
                }
            } catch (MalformedURLException e) {
                warn("Malformed URL " + url);
            } catch (IOException e) {
                warn("IOException opening URL " + url + ": " + e.getMessage());
            } catch (Exception e) {
                throw new RuntimeException();
            }
            return l;
        }
    }

    /**
     * Utility class to hold the result form a page fetch
     */
    public class PageFetcherResult {
        int httpStatus = 200;
        String contentType = "text/html";
        URL redirectUrl = null;
        byte[] content;
    }
}