eu.diacron.crawlservice.app.Util.java Source code

Introduction

Here is the source code for eu.diacron.crawlservice.app.Util.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package eu.diacron.crawlservice.app;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.RDF;
import com.hp.hpl.jena.vocabulary.RDFS;
import com.hp.hpl.jena.vocabulary.XSD;
import eu.diacron.crawlservice.config.Configuration;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.UUID;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.NameValuePair;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCReaderFactory;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;

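/**
 * Static helper methods for the crawl service: starting crawls and querying them on the
 * remote crawler over HTTP, downloading and reading WARC files, and writing extracted
 * page information out as RDF.
 *
 * @author eleni
 */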
public final class Util {

    private final Properties prop = new Properties();

    /**
     * Asks the remote crawler to start a page-scoped crawl of the given URL.
     *
     * <p>Posts a form with a random UUID as {@code name}, {@code scope=page} and the URL as
     * {@code seed} to {@code Configuration.REMOTE_CRAWLER_URL_CRAWL_INIT}, authenticating with
     * the configured crawler credentials. The last line of the response body is returned as the
     * crawl id. The HTTP status is printed but not checked.
     *
     * @param urltoCrawl the seed URL to crawl
     * @return the last line of the response body, or an empty string if the request failed
     *         with an {@link IOException} or the response carried no body
     */
    public static String getCrawlid(URL urltoCrawl) {
        String crawlid = "";
        System.out.println("start crawling page");

        CredentialsProvider credsProvider = new BasicCredentialsProvider();

        credsProvider.setCredentials(new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT),
                new UsernamePasswordCredentials(Configuration.REMOTE_CRAWLER_USERNAME,
                        Configuration.REMOTE_CRAWLER_PASS));
        CloseableHttpClient httpclient = HttpClients.custom().setDefaultCredentialsProvider(credsProvider).build();
        try {
            HttpPost httppost = new HttpPost(Configuration.REMOTE_CRAWLER_URL_CRAWL_INIT);

            List<NameValuePair> urlParameters = new ArrayList<NameValuePair>();
            urlParameters.add(new BasicNameValuePair("name", UUID.randomUUID().toString()));
            urlParameters.add(new BasicNameValuePair("scope", "page"));
            urlParameters.add(new BasicNameValuePair("seed", urltoCrawl.toString()));

            httppost.setEntity(new UrlEncodedFormEntity(urlParameters));

            System.out.println("Executing request " + httppost.getRequestLine());
            CloseableHttpResponse response = httpclient.execute(httppost);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                System.out.println("----------------------------------------");

                // A response is allowed to have no entity at all; treat that as "no id"
                // instead of failing with a NullPointerException.
                if (response.getEntity() == null) {
                    Logger.getLogger(Util.class.getName()).log(Level.WARNING,
                            "Crawl request for {0} returned no response body", urltoCrawl);
                } else {
                    // NOTE(review): the body is decoded with the platform default charset.
                    BufferedReader in = new BufferedReader(
                            new InputStreamReader(response.getEntity().getContent()));
                    try {
                        String inputLine;
                        // Every line overwrites the previous one, so the last line wins.
                        while ((inputLine = in.readLine()) != null) {
                            crawlid = inputLine;
                        }
                    } finally {
                        in.close();
                    }
                    EntityUtils.consume(response.getEntity());
                }
            } finally {
                response.close();
            }
        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                httpclient.close();
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        return crawlid;
    }

    /**
     * Queries the remote crawler for the status of a crawl.
     *
     * <p>Issues a GET to {@code Configuration.REMOTE_CRAWLER_URL_CRAWL + crawlid}, prints the
     * response to standard output, parses the body as a JSON object and returns its
     * {@code status} field. The HTTP status is printed but not checked.
     *
     * @param crawlid identifier of the crawl, appended verbatim to the crawl URL
     * @return the value of the {@code status} field, or an empty string if the request failed,
     *         the response had no body, or the body could not be parsed
     */
    public static String getCrawlStatusById(String crawlid) {

        String status = "";
        System.out.println("get crawlid");

        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT),
                new UsernamePasswordCredentials(Configuration.REMOTE_CRAWLER_USERNAME,
                        Configuration.REMOTE_CRAWLER_PASS));

        CloseableHttpClient httpclient = HttpClients.custom().setDefaultCredentialsProvider(credsProvider).build();
        try {
            HttpGet httpget = new HttpGet(Configuration.REMOTE_CRAWLER_URL_CRAWL + crawlid);

            System.out.println("Executing request " + httpget.getRequestLine());
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                System.out.println("----------------------------------------");

                // A response is allowed to have no entity at all; report it instead of
                // failing with a NullPointerException.
                if (response.getEntity() == null) {
                    Logger.getLogger(Util.class.getName()).log(Level.WARNING,
                            "Status request for crawl {0} returned no response body", crawlid);
                } else {
                    StringBuilder body = new StringBuilder();

                    // NOTE(review): the body is decoded with the platform default charset.
                    BufferedReader in = new BufferedReader(
                            new InputStreamReader(response.getEntity().getContent()));
                    try {
                        String inputLine;
                        // Lines are concatenated without separators, as before.
                        while ((inputLine = in.readLine()) != null) {
                            System.out.println(inputLine);
                            body.append(inputLine);
                        }
                    } finally {
                        in.close();
                    }

                    // TO-DO should be removed in the future and handle it more gracefully.
                    // The body uses u'...' prefixes and single quotes, which the JSON parser
                    // will not accept. The u prefix is dropped only at a word boundary so
                    // that a value ending in the letter u (e.g. 'menu') keeps it.
                    String result = body.toString().replaceAll("\\bu'", "'");
                    result = result.replace("'", "\"");

                    JSONObject crawljson = new JSONObject(result);
                    System.out.println("myObject " + crawljson.toString());

                    status = crawljson.getString("status");

                    EntityUtils.consume(response.getEntity());
                }
            } catch (JSONException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                response.close();
            }
        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                httpclient.close();
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        return status;
    }

    /**
     * Fetches the entries the remote crawler reports for a crawl as a JSON array.
     *
     * <p>Issues a GET to {@code Configuration.REMOTE_CRAWLER_URL + crawlid}, prints the response
     * to standard output and parses the body as a JSON array. Each element is printed as a
     * "url to download" line. The HTTP status is printed but not checked.
     *
     * @param crawlid identifier of the crawl, appended verbatim to the crawler URL
     * @return the parsed array, or {@code null} if the request failed, the response had no
     *         body, or the body could not be parsed as a JSON array
     */
    public static JSONArray getwarcsByCrawlid(String crawlid) {

        JSONArray warcsArray = null;
        System.out.println("get crawlid");

        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        // Credentials come from Configuration only; never hard-code them here.
        credsProvider.setCredentials(new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT),
                new UsernamePasswordCredentials(Configuration.REMOTE_CRAWLER_USERNAME,
                        Configuration.REMOTE_CRAWLER_PASS));
        CloseableHttpClient httpclient = HttpClients.custom().setDefaultCredentialsProvider(credsProvider).build();
        try {
            HttpGet httpget = new HttpGet(Configuration.REMOTE_CRAWLER_URL + crawlid);

            System.out.println("Executing request " + httpget.getRequestLine());
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                System.out.println("----------------------------------------");

                // A response is allowed to have no entity at all; report it instead of
                // failing with a NullPointerException.
                if (response.getEntity() == null) {
                    Logger.getLogger(Util.class.getName()).log(Level.WARNING,
                            "WARC request for crawl {0} returned no response body", crawlid);
                } else {
                    StringBuilder body = new StringBuilder();

                    // NOTE(review): the body is decoded with the platform default charset.
                    BufferedReader in = new BufferedReader(
                            new InputStreamReader(response.getEntity().getContent()));
                    try {
                        String inputLine;
                        // Lines are concatenated without separators, as before.
                        while ((inputLine = in.readLine()) != null) {
                            System.out.println(inputLine);
                            body.append(inputLine);
                        }
                    } finally {
                        in.close();
                    }

                    // The body uses u'...' prefixes and single quotes, which the JSON parser
                    // will not accept. The u prefix is dropped only at a word boundary so
                    // that an entry ending in the letter u keeps it.
                    String result = body.toString().replaceAll("\\bu'", "'");
                    result = result.replace("'", "\"");

                    JSONArray parsed = new JSONArray(result);
                    // Publish the array before iterating so a bad element does not discard it
                    // (matches the previous behaviour).
                    warcsArray = parsed;

                    for (int i = 0; i < parsed.length(); i++) {
                        System.out.println("url to download: " + parsed.getString(i));
                    }

                    EntityUtils.consume(response.getEntity());
                }
            } catch (JSONException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                response.close();
            }
        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                httpclient.close();
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        return warcsArray;
    }

    /**
     * Downloads the WARC archive at the given URL into {@code Configuration.TMP_FOLDER_CRAWL}.
     *
     * <p>The local file name is the base name of the URL's path (last extension removed) with
     * {@code .gz} appended. Only the path is used, so a query string cannot leak into the
     * file name.
     *
     * @param warcURLasString URL of the archive to download
     * @return absolute path of the downloaded file, or {@code null} if the URL is malformed or
     *         the download failed
     */
    private static String getwarcByURL(String warcURLasString) {
        try {
            URL warcURL = new URL(warcURLasString);
            String fileName = FilenameUtils.getBaseName(warcURL.getPath());

            File warcfile = new File(Configuration.TMP_FOLDER_CRAWL + fileName + ".gz");
            FileUtils.copyURLToFile(warcURL, warcfile);

            System.out.println("warcfile.getAbsolutePath() " + warcfile.getAbsolutePath());
            return warcfile.getAbsolutePath();
        } catch (IOException ex) {
            // The URL is deliberately left out of the message: it may carry signed query
            // parameters.
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, "Could not download WARC file", ex);
            return null;
        }
    }

    /**
     * Downloads a WARC archive and extracts basic page information from it.
     *
     * @param warcfilepath URL of the WARC archive to download (despite the name, this is a
     *                     URL and not a local path)
     * @return the page information extracted by {@code RDFizeWarcFile}; an empty array if the
     *         archive could not be downloaded
     */
    public static JSONArray manageWarcFile(String warcfilepath) {
        String downloadedfile = getwarcByURL(warcfilepath);
        if (downloadedfile == null) {
            // Nothing was downloaded, so there is nothing to parse.
            return new JSONArray();
        }
        return RDFizeWarcFile(downloadedfile);
    }

    /**
     * Reads the first records of a local WARC file and collects basic page information.
     *
     * <p>Each record's header, URL and the first 500 characters of its content are printed to
     * standard output. For every record whose content contains {@code HTTP/1.1 200 OK}, a JSON
     * object with {@code headerURL}, {@code title} and {@code firstParagraph} is added to the
     * result. At most the first six records are examined.
     *
     * @param warcfilepath path of the WARC file; it is also passed to the reader factory as
     *                     the archive's identifier
     * @return the collected page objects; whatever was gathered before a failure is returned
     *         if the file cannot be opened or read
     */
    private static JSONArray RDFizeWarcFile(String warcfilepath) {
        JSONArray json4RDFizing = new JSONArray();
        FileInputStream is = null;
        ArchiveReader ar = null;
        try {
            // Set up a local compressed WARC file for reading
            is = new FileInputStream(warcfilepath);
            // The file name identifies the ArchiveReader and indicates if it should be decompressed
            ar = WARCReaderFactory.get(warcfilepath, is, true);
            // Once we have an ArchiveReader, we can work through each of the records it contains
            int i = 0;
            for (ArchiveRecord r : ar) {
                // The header contains information such as the type of record, size, creation time, and URL
                System.out.println(r.getHeader());
                System.out.println(r.getHeader().getUrl());
                System.out.println();

                // The ArchiveRecord is itself an InputStream; read what it reports as available
                byte[] rawData = IOUtils.toByteArray(r, r.available());

                // NOTE(review): decoded with the platform default charset; the content may not be text.
                String content = new String(rawData);
                System.out.println(content.substring(0, Math.min(500, content.length())));
                System.out.println((content.length() > 500 ? "..." : ""));

                // Only successful HTTP responses are turned into page entries
                if (content.contains("HTTP/1.1 200 OK")) {

                    JSONObject pageBasics = new JSONObject();

                    pageBasics.put("headerURL", r.getHeader().getUrl());
                    System.out.println("headerURL " + r.getHeader().getUrl());

                    pageBasics.put("title", firstTagTextOrEmpty(content, "title"));
                    pageBasics.put("firstParagraph", firstTagTextOrEmpty(content, "p"));

                    json4RDFizing.put(pageBasics);

                }

                // Pretty printing to make the output more readable
                System.out.println("=-=-=-=-=-=-=-=-=");
                // Stops once the sixth record has been handled
                if (i++ > 4) {
                    break;
                }
            }

        } catch (FileNotFoundException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } catch (JSONException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            // Both handles may still be null if opening the file failed.
            try {
                if (ar != null) {
                    ar.close();
                }
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
            try {
                if (is != null) {
                    is.close();
                }
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        return json4RDFizing;

    }

    /**
     * Looks up the first {@code <tag>...</tag>} text, falling back to an empty string when the
     * content has no such element, so that one record without it does not abort the whole run.
     */
    private static String firstTagTextOrEmpty(String content, String tag) {
        try {
            return getTextBetweenTags(content, tag);
        } catch (IndexOutOfBoundsException noMatch) {
            // getTextBetweenTags may signal "no match" this way; an absent tag is expected here.
            return "";
        }
    }

    /**
     * Returns the text enclosed by the first {@code <tag>...</tag>} pair found in the content.
     *
     * <p>Only the bare form of the opening tag (no attributes) is recognised, and because
     * {@code .} does not match line terminators the enclosed text must sit on a single line.
     *
     * @param content text to search
     * @param tag     element name without angle brackets; it is inserted into the regular
     *                expression as-is
     * @return the text of the first match, or an empty string if the content contains no match
     */
    public static String getTextBetweenTags(String content, String tag) {
        final Pattern pattern = Pattern.compile("<" + tag + ">(.+?)</" + tag + ">");
        final Matcher matcher = pattern.matcher(content);
        // Only the first occurrence is ever returned, so there is no need to collect the rest.
        return matcher.find() ? matcher.group(1) : "";

    }

    /**
     * Serializes the model to a file in the configured RDF format.
     *
     * <p>The model is written through an {@code OutputStream} rather than a {@code Writer}, so
     * the output encoding is chosen by the RDF writer instead of the platform default charset.
     * The stream is always closed.
     *
     * @param model               the model to serialize
     * @param rdfizedWarcFilepath path of the file to create or overwrite
     * @return {@code true} if the file was written and closed without an I/O error,
     *         {@code false} otherwise
     */
    public static boolean storeRDFizedWarcFile(Model model, String rdfizedWarcFilepath) {
        System.out.println("rdfizedWarcFilepath" + rdfizedWarcFilepath
                + "Configuration.TMP_SERIALIZATION_RDF_FORMAT " + Configuration.TMP_SERIALIZATION_RDF_FORMAT);

        boolean stored = false;
        FileOutputStream outToSave = null;
        try {
            outToSave = new FileOutputStream(rdfizedWarcFilepath);
            model.write(outToSave, Configuration.TMP_SERIALIZATION_RDF_FORMAT);
            stored = true;
        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            if (outToSave != null) {
                try {
                    outToSave.close();
                } catch (IOException ex) {
                    // A failed close can mean the last bytes never reached the disk.
                    stored = false;
                    Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }
        return stored;

    }

    /**
     * Issues a GET to {@code Configuration.REMOTE_CRAWLER_URL_CRAWL} and prints the response
     * status line and body to standard output.
     *
     * <p>Nothing is returned; I/O failures are logged.
     */
    public static void getAllCrawls() {
        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT),
                new UsernamePasswordCredentials(Configuration.REMOTE_CRAWLER_USERNAME,
                        Configuration.REMOTE_CRAWLER_PASS));
        CloseableHttpClient httpclient = HttpClients.custom().setDefaultCredentialsProvider(credsProvider).build();
        try {
            HttpGet httpget = new HttpGet(Configuration.REMOTE_CRAWLER_URL_CRAWL);

            System.out.println("Executing request " + httpget.getRequestLine());
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                System.out.println("----------------------------------------");

                // A response is allowed to have no entity at all; report it instead of
                // failing with a NullPointerException.
                if (response.getEntity() == null) {
                    Logger.getLogger(Util.class.getName()).log(Level.WARNING,
                            "Crawl listing request returned no response body");
                } else {
                    // The body is printed line by line; printing the stream object itself
                    // would only show its identity, so that is not done.
                    // NOTE(review): the body is decoded with the platform default charset.
                    BufferedReader in = new BufferedReader(
                            new InputStreamReader(response.getEntity().getContent()));
                    try {
                        String inputLine;
                        while ((inputLine = in.readLine()) != null) {
                            System.out.println(inputLine);
                        }
                    } finally {
                        in.close();
                    }
                    EntityUtils.consume(response.getEntity());
                }
            } finally {
                response.close();
            }
        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                httpclient.close();
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }

    /**
     * Decompresses a gzip file next to the original.
     *
     * <p>The output path is the input path with a trailing {@code .gz} removed. If the input
     * does not end in {@code .gz}, {@code .unzipped} is appended instead, so the output never
     * overwrites the file being read.
     *
     * @param input_gzip_file path of the gzip-compressed file
     * @return the output path; it is returned even if decompression failed, in which case the
     *         failure is logged
     */
    private static String unzipWarcFile(String input_gzip_file) {

        final String gzSuffix = ".gz";
        final String output_file;
        // A literal suffix test: String.split would treat ".gz" as a regular expression and
        // cut the path at the first "any character followed by gz".
        if (input_gzip_file.endsWith(gzSuffix)) {
            output_file = input_gzip_file.substring(0, input_gzip_file.length() - gzSuffix.length());
        } else {
            output_file = input_gzip_file + ".unzipped";
        }

        byte[] buffer = new byte[1024];

        FileInputStream fis = null;
        GZIPInputStream gzis = null;
        FileOutputStream out = null;
        try {

            fis = new FileInputStream(input_gzip_file);
            gzis = new GZIPInputStream(fis);

            out = new FileOutputStream(output_file);

            int len;
            while ((len = gzis.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }

            System.out.println("Done");

        } catch (IOException ex) {
            Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                if (out != null) {
                    out.close();
                }
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
            try {
                // Closing the gzip stream also closes the file stream underneath it; the file
                // stream is closed directly only if the gzip wrapper was never created.
                if (gzis != null) {
                    gzis.close();
                } else if (fis != null) {
                    fis.close();
                }
            } catch (IOException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        return output_file;
    }

    /**
     * Builds a new array holding the elements of all given arrays, in argument order.
     *
     * @param arrs the arrays to append one after another
     * @return a new array; the inputs are not modified
     * @throws JSONException if an element cannot be read from one of the inputs
     */
    public static JSONArray concatArray(JSONArray... arrs) throws JSONException {
        final JSONArray merged = new JSONArray();
        for (int which = 0; which < arrs.length; which++) {
            final JSONArray source = arrs[which];
            int position = 0;
            while (position < source.length()) {
                merged.put(source.get(position));
                position++;
            }
        }
        return merged;
    }

    /**
     * Turns the extracted page objects into an RDF model and stores it in the crawl temp folder.
     *
     * <p>Every array element becomes a resource {@code <crawl namespace>pageinfo/<index>} typed
     * as {@code <crawl namespace>pageinfo}, with its {@code title} as {@code rdfs:label}, its
     * {@code headerURL} as {@code rdfs:isDefinedBy} and its {@code firstParagraph} as
     * {@code rdfs:comment}. If an element lacks one of these fields the error is logged and the
     * properties added up to that point are kept.
     *
     * @param jsonArray4RDFizing page objects to convert
     * @param crawlid            crawl identifier, used in the namespace and the output file name
     * @return always {@code true}
     */
    public static boolean generateRDFModel(JSONArray jsonArray4RDFizing, String crawlid) {

        final String baseUri = "http://localhost:8181/Diacrawl";
        final String crawlNamespace = baseUri + "/" + crawlid + "#";

        // Prefixes are registered only to make the serialized output easier to read.
        final Model rdfModel = ModelFactory.createDefaultModel();
        rdfModel.setNsPrefix("ds", baseUri + "#");
        rdfModel.setNsPrefix("rdf", RDF.getURI());
        rdfModel.setNsPrefix("xsd", XSD.getURI());
        rdfModel.setNsPrefix("rdfs", RDFS.getURI());
        rdfModel.setNsPrefix("cr", crawlNamespace);

        final Resource pageInfoType = rdfModel.createResource(crawlNamespace + "pageinfo");

        int index = 0;
        while (index < jsonArray4RDFizing.length()) {
            try {
                final JSONObject page = jsonArray4RDFizing.getJSONObject(index);
                // Each field is read just before its property is added, so a missing field
                // leaves the earlier properties in place.
                rdfModel.createResource(crawlNamespace + "pageinfo/" + index)
                        .addProperty(RDF.type, pageInfoType)
                        .addProperty(RDFS.label, page.getString("title"))
                        .addProperty(RDFS.isDefinedBy, page.getString("headerURL"))
                        .addProperty(RDFS.comment, page.getString("firstParagraph"));
            } catch (JSONException ex) {
                Logger.getLogger(Util.class.getName()).log(Level.SEVERE, null, ex);
            }
            index++;
        }

        final String targetPath = Configuration.TMP_FOLDER_CRAWL + crawlid + "RDFized."
                + Configuration.TMP_SERIALIZATION_RDF_FILEEXT;
        storeRDFizedWarcFile(rdfModel, targetPath);

        return true;
    }

    /**
     * Manual entry point: runs {@code RDFizeWarcFile} on a local WARC file.
     *
     * @param args optional; {@code args[0]} is the path of the WARC file to read. Without
     *             arguments the original developer-local sample path is used.
     * @throws Exception declared for convenience
     */
    public static void main(String[] args) throws Exception {
        String warcfile = "/home/eleni/Downloads/608c4b6b-42e7-40c1-a333-2b22c6f7b980-crawl-20150708110440-00000-c359dfd0-ce37-4f1c-a5d0-70c76e02f077.warc.gz";
        if (args != null && args.length > 0) {
            warcfile = args[0];
        }
        RDFizeWarcFile(warcfile);
    }

}