htmlwordtag.HtmlWordTag.java Source code

Java tutorial

Introduction

Here is the source code for htmlwordtag.HtmlWordTag.java

Source

package htmlwordtag;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.FileRequestEntity;
import org.apache.commons.httpclient.methods.PostMethod;
import org.openrdf.OpenRDFException;
import org.openrdf.model.BNode;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.ValueFactory;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.query.BindingSet;
import org.openrdf.query.MalformedQueryException;
import org.openrdf.query.QueryEvaluationException;
import org.openrdf.query.QueryLanguage;
import org.openrdf.query.TupleQuery;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.repository.sparql.SPARQLRepository;
import org.openrdf.rio.RDFFormat;
import org.openrdf.sail.memory.MemoryStore;

public class HtmlWordTag {

    private static final String CALAIS_URL = "http://api.opencalais.com/tag/rs/enrich";

    private File input;
    private File output;
    private HttpClient client;

    private PostMethod createPostMethod() {

        PostMethod method = new PostMethod(CALAIS_URL);

        // Set mandatory parameters
        method.setRequestHeader("x-calais-licenseID", "guuv7mvdatyk3d5yy6xab8d3");

        // Set input content type
        //method.setRequestHeader("Content-Type", "text/xml; charset=UTF-8");
        method.setRequestHeader("Content-Type", "text/html; charset=UTF-8");
        //method.setRequestHeader("Content-Type", "text/raw; charset=UTF-8");

        // Set response/output format
        method.setRequestHeader("Accept", "xml/rdf");
        //method.setRequestHeader("Accept", "application/json");

        // Enable Social Tags processing
        method.setRequestHeader("enableMetadataType", "SocialTags");

        return method;
    }

    private void run() {
        try {
            if (input.isFile()) {
                postFile(input, createPostMethod());
            } else if (input.isDirectory()) {
                System.out.println("working on all files in " + input.getAbsolutePath());
                for (File file : input.listFiles()) {
                    if (file.isFile())
                        postFile(file, createPostMethod());
                    else
                        System.out.println("skipping " + file.getAbsolutePath());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private void doRequest(File file, PostMethod method) {
        try {
            int returnCode = client.executeMethod(method);
            if (returnCode == HttpStatus.SC_NOT_IMPLEMENTED) {
                System.err.println("The Post method is not implemented by this URI");
                // still consume the response body
                method.getResponseBodyAsString();
            } else if (returnCode == HttpStatus.SC_OK) {
                System.out.println("File post succeeded: " + file);
                saveResponse(file, method);
            } else {
                System.err.println("File post failed: " + file);
                System.err.println("Got code: " + returnCode);
                System.err.println("response: " + method.getResponseBodyAsString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            method.releaseConnection();
        }
    }

    private void saveResponse(File file, PostMethod method) throws IOException {
        PrintWriter writer = null;
        try {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(method.getResponseBodyAsStream(), "UTF-8"));
            File out = new File(output, file.getName() + ".xml");
            writer = new PrintWriter(new BufferedWriter(new FileWriter(out)));
            String line;
            while ((line = reader.readLine()) != null) {
                writer.println(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (writer != null)
                try {
                    writer.close();
                } catch (Exception ignored) {
                }
        }
    }

    private void postFile(File file, PostMethod method) throws IOException {
        method.setRequestEntity(new FileRequestEntity(file, null));
        doRequest(file, method);
    }

    private static void loadhtml() {
        String current = System.getProperty("user.dir");
        System.out.println("Current working directory in Java : " + current);

        File dir = new File(current + "\\input");
        if (!dir.exists()) {
            if (dir.mkdir()) {
                System.out.println("Directory is created!");
            } else {
                System.out.println("Failed to create directory!");
            }
        }

        File f = null;
        try {
            // create new file
            f = new File(current + "\\input\\website1.html");

            // tries to create new file in the system
            if (f.exists()) {
                if (f.delete()) {
                    System.out.println("file Website1.html is already exist.");
                    System.out.println("file Website1.html has been delete.");
                    if (f.createNewFile()) {
                        System.out.println("create Webstie1.html success");
                    } else {
                        System.out.println("fail to create Webstie1.html");
                    }
                } else {
                    System.out.println("fail to delete Website1.html.");
                }
            } else {
                if (f.createNewFile()) {
                    System.out.println("create Webstie1.html success");
                } else {
                    System.out.println("fail to create Webstie1.html");
                }
            }

            PrintWriter outputStream = null;
            try {
                outputStream = new PrintWriter(new FileOutputStream(current + "\\input\\website1.html"));
            } catch (FileNotFoundException e) {
                System.out.println("Error to find file webstie1.html");
                System.exit(0);
            }
            URL website = new URL(
                    "http://www.engadget.com/2014/11/23/hoverboard-cardboard-robots-toyota-prius-camper/");
            URLConnection connection = website.openConnection();
            BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));

            StringBuilder response = new StringBuilder();
            String inputLine;

            while ((inputLine = in.readLine()) != null)
                response.append(inputLine);

            in.close();
            outputStream.println(response);
            outputStream.close();

        } catch (Exception e) {
            e.printStackTrace();
        }

        File f2 = null;
        try {
            // create new file
            f2 = new File(current + "\\input\\website2.html");

            // tries to create new file in the system
            if (f2.exists()) {
                if (f2.delete()) {
                    System.out.println("file Website2.html is already exist.");
                    System.out.println("file Website2.html has been delete.");
                    if (f.createNewFile()) {
                        System.out.println("create Webstie2.html success");
                    } else {
                        System.out.println("fail to create Webstie2.html");
                    }
                } else {
                    System.out.println("fail to delete Website2.html.");
                }
            } else {
                if (f2.createNewFile()) {
                    System.out.println("create Webstie2.html success");
                } else {
                    System.out.println("fail to create Webstie2.html");
                }
            }

            PrintWriter outputStream2 = null;
            try {
                outputStream2 = new PrintWriter(new FileOutputStream(current + "\\input\\website2.html"));
            } catch (FileNotFoundException e) {
                System.out.println("Error to find file webstie2.html");
                System.exit(0);
            }
            URL website = new URL("http://www.engadget.com/2014/11/21/net-neutrality-lawsuit/");
            URLConnection connection = website.openConnection();
            BufferedReader in = new BufferedReader(new InputStreamReader(connection.getInputStream()));

            StringBuilder response = new StringBuilder();
            String inputLine;

            while ((inputLine = in.readLine()) != null)
                response.append(inputLine);

            in.close();
            outputStream2.println(response);
            outputStream2.close();

        } catch (Exception e) {
            e.printStackTrace();
        }

    }

    public static void main(String[] args)
            throws RepositoryException, MalformedQueryException, QueryEvaluationException {
        //get current path
        String current = System.getProperty("user.dir");
        //get html file from internet
        loadhtml();
        //make director for output
        verifyArgs();
        //translate html file to rdf
        HtmlWordTag httpClientPost = new HtmlWordTag();
        httpClientPost.input = new File("input");
        httpClientPost.output = new File("output");
        httpClientPost.client = new HttpClient();
        httpClientPost.client.getParams().setParameter("http.useragent", "Calais Rest Client");

        httpClientPost.run();

        //create main memory repository
        Repository repo = new SailRepository(new MemoryStore());
        repo.initialize();

        File file = new File(current + "\\output\\website1.html.xml");

        RepositoryConnection con = repo.getConnection();
        try {
            con.add(file, null, RDFFormat.RDFXML);
        } catch (OpenRDFException e) {
            // handle exception
        } catch (java.io.IOException e) {
            // handle io exception
        }

        System.out.println(con.isEmpty());

        //query entire repostiory
        String queryString = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
                + "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
                + "PREFIX c: <http://s.opencalais.com/1/type/em/e/>\n"
                + "PREFIX p: <http://s.opencalais.com/1/pred/>\n"
                + "PREFIX geo: <http://s.opencalais.com/1/type/er/Geo/>\n"

                + "SELECT  distinct ?s ?n\n" + "WHERE {\n" + "{  ?s rdf:type c:Organization.\n"
                + "   ?s p:name ?n.\n}" + "  UNION \n" + "{  ?s rdf:type c:Person.\n" + "   ?s p:name ?n.\n}"
                + "  UNION \n" + "{  ?s rdf:type geo:City.\n" + "   ?s p:name ?n.\n}" + "}";

        //System.out.println(queryString);

        //insert query through sparql repository connection
        TupleQuery tupleQuery = con.prepareTupleQuery(QueryLanguage.SPARQL, queryString);
        TupleQueryResult result = tupleQuery.evaluate();

        File queryresultdir = new File(current + "\\queryresult");
        if (!queryresultdir.exists()) {
            if (queryresultdir.mkdir()) {
                System.out.println("Directory is created!");
            } else {
                System.out.println("Failed to create directory!");
            }
        }

        File queryresult = null;
        try {
            // create new file
            queryresult = new File(current + "\\queryresult\\queryresult1.txt");

            // tries to create new file in the system
            if (queryresult.exists()) {
                if (queryresult.delete()) {
                    System.out.println("file queryresult1.txt is already exist.");
                    System.out.println("file queryresult1.txt has been delete.");
                    if (queryresult.createNewFile()) {
                        System.out.println("create queryresult1.txt success");
                    } else {
                        System.out.println("fail to create queryresult1.txt");
                    }
                } else {
                    System.out.println("fail to delete queryresult1.txt.");
                }
            } else {
                if (queryresult.createNewFile()) {
                    System.out.println("create queryresult1.txt success");
                } else {
                    System.out.println("fail to create queryresult1.txt");
                }
            }

        } catch (Exception e) {
            e.printStackTrace();
        }

        try {
            PrintWriter outputStream = null;
            try {
                outputStream = new PrintWriter(new FileOutputStream(current + "\\queryresult\\queryresult1.txt"));
            } catch (FileNotFoundException e) {
                System.out.println("Error to find file queryresult1.txt");
                System.exit(0);
            }
            //go through all triple in sparql repository
            while (result.hasNext()) { // iterate over the result
                BindingSet bindingSet = result.next();
                Value valueOfS = bindingSet.getValue("s");
                Value valueOfN = bindingSet.getValue("n");

                System.out.println(valueOfS + " " + valueOfN);

                outputStream.println(valueOfS + " " + valueOfN);

            }
            outputStream.close();
        } finally {
            result.close();
        }
        //create main memory repository
        Repository repo2 = new SailRepository(new MemoryStore());
        repo2.initialize();

        File file2 = new File(current + "\\output\\website2.html.xml");

        RepositoryConnection con2 = repo2.getConnection();
        try {
            con2.add(file2, null, RDFFormat.RDFXML);
        } catch (OpenRDFException e) {
            // handle exception
        } catch (java.io.IOException e) {
            // handle io exception
        }

        System.out.println(con2.isEmpty());

        //query entire repostiory
        String queryString2 = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
                + "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
                + "PREFIX c: <http://s.opencalais.com/1/type/em/e/>\n"
                + "PREFIX p: <http://s.opencalais.com/1/pred/>\n"
                + "PREFIX geo: <http://s.opencalais.com/1/type/er/Geo/>\n"

                + "SELECT  distinct ?s ?n\n" + "WHERE {\n" + "{  ?s rdf:type c:Organization.\n"
                + "   ?s p:name ?n.\n}" + "  UNION \n" + "{  ?s rdf:type c:Person.\n" + "   ?s p:name ?n.\n}"
                + "  UNION \n" + "{  ?s rdf:type geo:City.\n" + "   ?s p:name ?n.\n}" + "}";

        //System.out.println(queryString2);

        //insert query through sparql repository connection
        TupleQuery tupleQuery2 = con2.prepareTupleQuery(QueryLanguage.SPARQL, queryString2);
        TupleQueryResult result2 = tupleQuery2.evaluate();

        File queryresult2 = null;
        try {
            // create new file
            queryresult2 = new File(current + "\\queryresult\\queryresult2.txt");

            // tries to create new file in the system
            if (queryresult2.exists()) {
                if (queryresult2.delete()) {
                    System.out.println("file queryresult2.txt is already exist.");
                    System.out.println("file queryresult2.txt has been delete.");
                    if (queryresult2.createNewFile()) {
                        System.out.println("create queryresult2.txt success");
                    } else {
                        System.out.println("fail to create queryresult2.txt");
                    }
                } else {
                    System.out.println("fail to delete queryresult2.txt.");
                }
            } else {
                if (queryresult2.createNewFile()) {
                    System.out.println("create queryresult2.txt success");
                } else {
                    System.out.println("fail to create queryresult2.txt");
                }
            }

        } catch (Exception e) {
            e.printStackTrace();
        }

        try {
            PrintWriter outputStream2 = null;
            try {
                outputStream2 = new PrintWriter(new FileOutputStream(current + "\\queryresult\\queryresult2.txt"));
            } catch (FileNotFoundException e) {
                System.out.println("Error to find file queryresult2.txt");
                System.exit(0);
            }
            //go through all triple in sparql repository
            while (result2.hasNext()) { // iterate over the result
                BindingSet bindingSet = result2.next();
                Value valueOfS = bindingSet.getValue("s");
                Value valueOfN = bindingSet.getValue("n");

                System.out.println(valueOfS + " " + valueOfN);

                outputStream2.println(valueOfS + " " + valueOfN);

            }
            outputStream2.close();
        } finally {
            result2.close();
        }

    }

    private static void verifyArgs() {

        if (!new File("input").exists())
            usageError("file " + "input" + " doesn't exist");
        File outdir = new File("output");
        if (!outdir.exists() && !outdir.mkdirs())
            usageError("couldn't create output dir");

    }

    private static void usageError(String s) {
        System.err.println(s);
        System.err.println("Usage: java " + (new Object() {
        }.getClass().getEnclosingClass()).getName() + " input_dir output_dir");
        System.exit(-1);
    }

}