analysePortalU.AnalysePortalUData.java Source code

Java tutorial

Introduction

Here is the source code for analysePortalU.AnalysePortalUData.java

Source

/*
 * **************************************************-
 * ingrid-iplug-opensearch:war
 * ==================================================
 * Copyright (C) 2014 - 2017 wemove digital solutions GmbH
 * ==================================================
 * Licensed under the EUPL, Version 1.1 or  as soon they will be
 * approved by the European Commission - subsequent versions of the
 * EUPL (the "Licence");
 * 
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 * http://ec.europa.eu/idabc/eupl5
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and
 * limitations under the Licence.
 * **************************************************#
 */
package analysePortalU;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpClientParams;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import de.ingrid.utils.xml.ConfigurableNamespaceContext;
import de.ingrid.utils.xpath.XPathUtils;

/**
 * Command line tool that queries the PortalU OpenSearch interface for every
 * keyword listed in the classpath resource {@code keywords.txt}, collects the
 * metadata hits (keyed by their file identifier) and writes them to
 * {@code result.csv} in the current working directory.
 */
public class AnalysePortalUData {

    /**
     * Number of hits requested per result page (sent as the {@code h} query
     * parameter). A page that returns fewer hits than this is treated as the
     * last page for the current keyword.
     */
    public static int PAGE_SIZE = 50;

    /** Connection and socket timeout used for all HTTP requests. */
    private static final int TIMEOUT_MILLIS = 60 * 1000;

    /** Relative XPath prefix of the postal address fields below a responsible party. */
    private static final String ADDRESS_PATH = "contactInfo/CI_Contact/address/CI_Address/";

    /** Relative XPath prefix of the telephone fields below a responsible party. */
    private static final String PHONE_PATH = "contactInfo/CI_Contact/phone/CI_Telephone/";

    /**
     * Runs the complete analysis: reads the keywords, pages through the search
     * results of every keyword and finally writes all collected records to
     * {@code result.csv}.
     *
     * @throws HttpException if an HTTP protocol error occurs while querying
     * @throws IOException if {@code keywords.txt} is missing, a request fails,
     *             a result link is not a valid URL or the CSV cannot be written
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException if a response is not well-formed XML
     * @throws TransformerException declared for backward compatibility
     */
    public void analyse()
            throws HttpException, IOException, ParserConfigurationException, SAXException, TransformerException {

        Map<String, Map<String, String>> resultMap = new HashMap<String, Map<String, String>>();

        // Fail with a clear message instead of a NullPointerException inside
        // the Scanner constructor when the keyword list is not on the classpath.
        URL keywordResource = getClass().getClassLoader().getResource("keywords.txt");
        if (keywordResource == null) {
            throw new IOException("Resource 'keywords.txt' not found on the classpath");
        }

        Scanner in = new Scanner(keywordResource.openStream());
        try {
            HttpClient client = createHttpClient();
            while (in.hasNextLine()) {
                // URLEncoder already encodes blanks as '+', no further
                // replacement is necessary.
                String keyword = URLEncoder.encode(in.nextLine().trim(), "UTF-8");

                int currentPage = 1;
                boolean moreResults = true;

                while (moreResults) {
                    String url = "http://www.portalu.de/opensearch/query?q=" + keyword
                            + "+datatype:metadata+ranking:score&h=" + PAGE_SIZE + "&detail=1&ingrid=1&p="
                            + currentPage;

                    System.out.println("Query: " + url);
                    int numberOfResults = fetchPage(client, url, resultMap);

                    // A full page means there may be more; anything else
                    // (short page, empty page, non-200 response) ends paging.
                    if (numberOfResults > 0 && numberOfResults >= PAGE_SIZE) {
                        currentPage++;
                    } else {
                        moreResults = false;
                    }
                }
            }
        } finally {
            // Also closes the underlying resource stream.
            in.close();
        }

        StringWriter sw = new StringWriter();
        ExcelCSVPrinter ecsvp = new ExcelCSVPrinter(sw);
        // The header is taken from the first record; every row is then written
        // by looking up exactly these fields, so the columns always line up
        // with the header regardless of each map's own iteration order.
        String[] fields = null;
        for (Map<String, String> result : resultMap.values()) {
            if (fields == null) {
                fields = result.keySet().toArray(new String[0]);
                for (String field : fields) {
                    ecsvp.print(field);
                }
                ecsvp.println("");
            }
            for (String field : fields) {
                ecsvp.print(result.get(field));
            }
            ecsvp.println("");
        }

        PrintWriter out = new PrintWriter("result.csv");
        try {
            out.write(sw.toString());
            // PrintWriter never throws on write errors; ask explicitly so a
            // failed export is not reported as success.
            if (out.checkError()) {
                throw new IOException("Could not write result.csv");
            }
        } finally {
            out.close();
        }

        System.out.println("Done.");
    }

    /**
     * Creates the HTTP client used for all queries, with connection and socket
     * timeouts set to {@link #TIMEOUT_MILLIS}.
     *
     * @return a configured client backed by a {@link SimpleHttpConnectionManager}
     */
    private static HttpClient createHttpClient() {
        HttpClientParams httpClientParams = new HttpClientParams();
        HttpConnectionManager httpConnectionManager = new SimpleHttpConnectionManager();
        httpClientParams.setSoTimeout(TIMEOUT_MILLIS);
        httpConnectionManager.getParams().setConnectionTimeout(TIMEOUT_MILLIS);
        httpConnectionManager.getParams().setSoTimeout(TIMEOUT_MILLIS);
        return new HttpClient(httpClientParams, httpConnectionManager);
    }

    /**
     * Requests one result page and merges all contained items into
     * {@code resultMap}.
     *
     * @param client the HTTP client to use
     * @param url the complete query URL of the page
     * @param resultMap collected records, keyed by file identifier
     * @return the number of items found on the page, or {@code 0} if the
     *         server did not answer with HTTP status 200
     */
    private static int fetchPage(HttpClient client, String url, Map<String, Map<String, String>> resultMap)
            throws HttpException, IOException, ParserConfigurationException, SAXException, TransformerException {
        HttpMethod method = new GetMethod(url);
        try {
            int status = client.executeMethod(method);
            if (status != 200) {
                return 0;
            }

            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(method.getResponseBodyAsStream());
            XPathUtils xpath = new XPathUtils(new ConfigurableNamespaceContext());
            NodeList results = xpath.getNodeList(doc, "/rss/channel/item");

            for (int i = 0; i < results.getLength(); i++) {
                collectResult(xpath, results.item(i), resultMap);
            }
            return results.getLength();
        } finally {
            // Always hand the connection back, otherwise it leaks per request.
            method.releaseConnection();
        }
    }

    /**
     * Extracts the fields of a single result item and stores them in the
     * record belonging to the item's file identifier. An existing record with
     * the same identifier is updated in place.
     *
     * @param xpath the XPath helper used for evaluation
     * @param node the {@code item} element of the search response
     * @param resultMap collected records, keyed by file identifier
     */
    private static void collectResult(XPathUtils xpath, Node node, Map<String, Map<String, String>> resultMap)
            throws HttpException, IOException, ParserConfigurationException, SAXException, TransformerException {
        String fileIdentifier = xpath.getString(node, ".//*/fileIdentifier/CharacterString");
        if (!resultMap.containsKey(fileIdentifier)) {
            resultMap.put(fileIdentifier, new HashMap<String, String>());
        }
        Map<String, String> currentMap = resultMap.get(fileIdentifier);
        currentMap.put("uuid", fileIdentifier);
        currentMap.put("partner", xpath.getString(node, "partner"));
        currentMap.put("provider", xpath.getString(node, "provider"));
        currentMap.put("udk-class", xpath.getString(node, "udk-class"));
        currentMap.put("source", xpath.getString(node, "source"));
        // Going through java.net.URL rejects links that are not valid URLs.
        currentMap.put("url", new URL(xpath.getString(node, "link")).toString());
        currentMap.put("title", xpath.getString(node, ".//*/title/CharacterString"));
        currentMap.put("description", xpath.getString(node, ".//*/abstract/CharacterString"));

        Node addressNode = xpath.getNode(node, ".//*/contact/idfResponsibleParty");
        currentMap.put("pointOfContact", buildPointOfContact(xpath, addressNode));
    }

    /**
     * Builds the multi-line contact text (name, organisation, postal address,
     * email, phone, fax) from a responsible party element. Missing or empty
     * fields are skipped.
     *
     * @param xpath the XPath helper used for evaluation
     * @param addressNode the responsible party element to read from
     * @return the formatted contact text, possibly empty
     */
    private static String buildPointOfContact(XPathUtils xpath, Node addressNode)
            throws HttpException, IOException, ParserConfigurationException, SAXException, TransformerException {
        StringBuilder address = new StringBuilder();
        appendIfPresent(address, "", xpath.getString(addressNode, "individualName/CharacterString"), "\n");
        appendIfPresent(address, "", xpath.getString(addressNode, "organisationName/CharacterString"), "\n");
        appendIfPresent(address, "",
                xpath.getString(addressNode, ADDRESS_PATH + "deliveryPoint/CharacterString"), "\n");
        // Postal code and city share one line.
        appendIfPresent(address, "",
                xpath.getString(addressNode, ADDRESS_PATH + "postalCode/CharacterString"), " ");
        appendIfPresent(address, "",
                xpath.getString(addressNode, ADDRESS_PATH + "city/CharacterString"), "\n");
        appendIfPresent(address, "",
                xpath.getString(addressNode, ADDRESS_PATH + "country/CharacterString"), "\n");
        appendIfPresent(address, "Email: ",
                xpath.getString(addressNode, ADDRESS_PATH + "electronicMailAddress/CharacterString"), "\n");
        appendIfPresent(address, "Tel: ",
                xpath.getString(addressNode, PHONE_PATH + "voice/CharacterString"), "\n");
        appendIfPresent(address, "Fax: ",
                xpath.getString(addressNode, PHONE_PATH + "facsimile/CharacterString"), "\n");
        return address.toString();
    }

    /**
     * Appends {@code prefix + value + suffix} to {@code target} unless
     * {@code value} is {@code null} or empty.
     */
    private static void appendIfPresent(StringBuilder target, String prefix, String value, String suffix) {
        if (value != null && value.length() > 0) {
            target.append(prefix).append(value).append(suffix);
        }
    }

    /**
     * Program entry point; runs {@link #analyse()} once.
     *
     * @param args ignored
     * @throws Exception if the analysis fails
     */
    public static void main(String[] args) throws Exception {

        AnalysePortalUData app = new AnalysePortalUData();
        app.analyse();
    }

}