nu.validator.svgresearch.SvgDownloader.java Source code

Java tutorial

Introduction

Here is the source code for nu.validator.svgresearch.SvgDownloader.java

Source

/*
 * Copyright (c) 2008 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.svgresearch;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.mortbay.util.IO;

import com.hp.hpl.jena.iri.IRI;
import com.hp.hpl.jena.iri.IRIFactory;

/**
 * Command-line tool that downloads files from Wikimedia Commons.
 *
 * <p>
 * Given a UTF-8 text file containing one Commons file name per line, the tool
 * derives the upload URL for each name (Commons shards files into directories
 * named after the first one and first two hex digits of the MD5 of the file
 * name), writes a tab-separated {@code md5<TAB>uri} index sorted by MD5, and
 * then downloads every URI into a target directory as {@code <md5>.svg}.
 */
public class SvgDownloader {

    /** Number of times a single download is attempted before giving up. */
    private static final int MAX_ATTEMPTS = 3;

    /** Pause between two attempts for the same URI, in milliseconds. */
    private static final long RETRY_DELAY_MS = 1000;

    private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();

    private static final HttpClient client = new HttpClient(manager);

    static {
        HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
        hcmp.setConnectionTimeout(5000);
        hcmp.setSoTimeout(5000);
        hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION, 100);
        hcmp.setMaxTotalConnections(200);
        HttpClientParams hcp = client.getParams();
        hcp.setBooleanParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, true);
        hcp.setIntParameter(HttpClientParams.MAX_REDIRECTS, 20); // Gecko default
        // Placeholder: replace with a user agent that identifies the operator.
        hcp.setParameter("http.useragent", "SET_YOUR_OWN_UA");
    }

    /**
     * Renders a byte array as lowercase hexadecimal, two digits per byte.
     *
     * @param md5
     *            the bytes to render (in practice an MD5 digest)
     * @return the hexadecimal representation, {@code 2 * md5.length}
     *         characters long
     */
    private static String toHexString(byte[] md5) {
        StringBuilder sb = new StringBuilder(md5.length * 2);
        for (int i = 0; i < md5.length; i++) {
            // Mask to 0..255 so negative bytes do not sign-extend.
            int asInt = md5[i] & 0xFF;
            if (asInt < 0x10) {
                sb.append('0');
            }
            sb.append(Integer.toHexString(asInt));
        }
        return sb.toString();
    }

    /**
     * Downloads {@code uri} into {@code target}, trying up to
     * {@link #MAX_ATTEMPTS} times when an attempt fails with an exception.
     *
     * <p>
     * A response with a status other than 200 is not retried: the status code
     * is printed to standard error and the method returns. Failures are
     * reported on standard error but never propagated, so one bad URI does not
     * abort a batch run.
     *
     * @param uri
     *            the URI to fetch
     * @param target
     *            the file the response body is written to
     */
    private static void retrieve(String uri, File target) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            GetMethod m = null;
            try {
                m = new GetMethod(uri);
                m.setFollowRedirects(true);
                m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
                m.addRequestHeader("Accept", "image/svg+xml, */*");
                client.executeMethod(m);
                int status = m.getStatusCode();
                if (status != 200) {
                    System.err.println(status);
                    return;
                }
                InputStream in = m.getResponseBodyAsStream();
                if (in == null) {
                    System.err.println("No response body for " + uri);
                    return;
                }
                try {
                    FileOutputStream out = new FileOutputStream(target);
                    try {
                        IO.copy(in, out);
                        out.flush();
                    } finally {
                        out.close();
                    }
                } finally {
                    in.close();
                }
                return;
            } catch (Exception e) {
                // Deliberately broad: any failure (bad URI, I/O error,
                // protocol error) is treated as a retryable attempt.
                System.err.println("Attempt " + attempt + " of " + MAX_ATTEMPTS + " failed for " + uri + ": " + e);
            } finally {
                // Always hand the connection back to the pool; otherwise
                // error responses and exceptions would exhaust it.
                if (m != null) {
                    m.releaseConnection();
                }
            }
            if (attempt < MAX_ATTEMPTS) {
                try {
                    Thread.sleep(RETRY_DELAY_MS);
                } catch (InterruptedException interrupted) {
                    // Preserve the interrupt for the caller and stop retrying.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Reads file names from {@code listFile} (UTF-8, one per line) and maps
     * the hex MD5 of each name to its Wikimedia Commons upload URI. Empty
     * lines are skipped.
     *
     * @param listFile
     *            path of the file listing the names
     * @return a map from hex MD5 to ASCII URI, sorted by MD5
     * @throws IOException
     *             if the list cannot be read
     * @throws NoSuchAlgorithmException
     *             if MD5 is not available
     */
    private static SortedMap<String, String> readUris(String listFile)
            throws IOException, NoSuchAlgorithmException {
        SortedMap<String, String> uris = new TreeMap<String, String>();
        MessageDigest md = MessageDigest.getInstance("MD5");
        IRIFactory fac = new IRIFactory();
        BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(listFile), "utf-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.length() == 0) {
                    // An empty name would yield a directory URL, not a file.
                    continue;
                }
                byte[] md5 = md.digest(line.getBytes("utf-8"));
                String md5str = toHexString(md5);
                IRI iri = fac.create("http://upload.wikimedia.org/wikipedia/commons/" + md5str.substring(0, 1) + '/'
                        + md5str.substring(0, 2) + '/' + line);
                uris.put(md5str, iri.toASCIIString());
            }
        } finally {
            in.close();
        }
        return uris;
    }

    /**
     * Writes one {@code md5<TAB>uri} line per entry to {@code indexFile} in
     * UTF-8, in the iteration order of {@code uris}.
     *
     * @param uris
     *            the entries to write
     * @param indexFile
     *            path of the file to (over)write
     * @throws IOException
     *             if the file cannot be written
     */
    private static void writeIndex(SortedMap<String, String> uris, String indexFile) throws IOException {
        Writer out = new OutputStreamWriter(new FileOutputStream(indexFile), "utf-8");
        try {
            for (Map.Entry<String, String> entry : uris.entrySet()) {
                out.write(entry.getKey());
                out.write('\t');
                out.write(entry.getValue());
                out.write('\n');
            }
            out.flush();
        } finally {
            out.close();
        }
    }

    /**
     * Downloads every URI in {@code uris} into {@code dir} as
     * {@code <md5>.svg}, printing the completed fraction (0..1) to standard
     * output after each file.
     *
     * @param uris
     *            map from hex MD5 to URI
     * @param dir
     *            the target directory; created if it does not exist
     * @throws IOException
     *             if {@code dir} is not a directory and cannot be created
     */
    private static void downloadAll(SortedMap<String, String> uris, File dir) throws IOException {
        // Fail once up front instead of failing (and retrying) per file.
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("Cannot create download directory: " + dir);
        }
        int total = uris.size();
        int count = 0;
        for (Map.Entry<String, String> entry : uris.entrySet()) {
            File target = new File(dir, entry.getKey() + ".svg");
            retrieve(entry.getValue(), target);
            count++;
            System.out.println(((double) count) / ((double) total));
        }
    }

    /**
     * Entry point.
     *
     * @param args
     *            {@code args[0]}: UTF-8 file with one Commons file name per
     *            line; {@code args[1]}: path of the md5/URI index file to
     *            write; {@code args[2]}: directory to download into
     * @throws IOException
     *             if the name list cannot be read, the index cannot be
     *             written, or the download directory cannot be created
     * @throws NoSuchAlgorithmException
     *             if MD5 is not available
     * @throws IllegalArgumentException
     *             if fewer than three arguments are given
     */
    public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
        if (args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: SvgDownloader <name-list-file> <index-output-file> <download-dir>");
        }
        SortedMap<String, String> theMap = readUris(args[0]);
        writeIndex(theMap, args[1]);
        downloadAll(theMap, new File(args[2]));
    }

}