ie.pars.nlp.sketchengine.interactions.FreqSKEInteraction.java Source code

Java tutorial

Introduction

Here is the source code for ie.pars.nlp.sketchengine.interactions.FreqSKEInteraction.java

Source

/* 
 * Copyright (C) 2016 Behrang QasemiZadeh <zadeh at phil.hhu.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package ie.pars.nlp.sketchengine.interactions;

import ie.pars.noske.json.parsers.FreqMethodJsonParser;
import ie.pars.noske.parse.obj.FrequencyLine;
import ie.pars.noske.parse.obj.WLTGD;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.HttpClient;
import org.json.JSONObject;

/**
 * The class constructors are the only entry points meant to be called directly
 * (apart from start etc.). Three usage modes are envisaged: 1) get the result
 * and dump it into a file; 2) get the result, parse it and dump it into a file;
 * 3) get the result and store it in some kind of object for passing to the
 * next step.
 *
 * @author Behrang QasemiZadeh @ One cold winter day
 */
public class FreqSKEInteraction extends SKEInteractionsBase {

    /** Page size sent to the server by the constructors that do not take one. */
    private static final int DEFAULT_MAX_MAX_PAGE_SIZE = 100000;

    /** Value sent as the {@code async} request parameter. */
    private static final int ASYNC = 0;

    private static final Logger LOGGER = Logger.getLogger(FreqSKEInteraction.class.getName());

    private final int maxPageSize;
    private final String fcritQuery;
    private final int minFreqFilter;
    private final int exampleNumber;
    private final String runCGIPath;
    /** {@code true}: write parsed frequency lines; {@code false}: write the raw JSON pages. */
    private final boolean writeParsed;
    /** {@code true}: append to {@link #fileOutput} instead of overwriting it. */
    private final boolean appendWritingMode;
    /** Caller-supplied output; {@code null} when writing to {@link #fileOutput}. */
    private final Writer writer;
    /** Output file owned by this instance; {@code null} when a writer was supplied. */
    private final File fileOutput;

    /**
     * Creates an interaction that writes its result to a caller-supplied writer.
     *
     * @param writer destination of the output; it is flushed and closed after a
     * successful run
     * @param baseURL passed to the base class
     * @param runCGIPath path prefix placed before {@code /freqs?} in the request
     * @param corpus passed to the base class; sent as {@code corpname}
     * @param query passed to the base class; sent as {@code q} (prefixed with "q")
     * @param fcritQuery value of the {@code fcrit} request parameter
     * @param minFreqFilter value of the {@code flimit} request parameter
     * @param exampleNumner value of the {@code examples} request parameter
     * @param writeParsed {@code true} to write parsed lines, {@code false} to
     * write raw JSON
     * @param appendWritingMode only allowed together with {@code writeParsed}
     * @throws IllegalArgumentException if {@code appendWritingMode} is set
     * without {@code writeParsed}
     * @throws Exception declared for backward compatibility
     */
    public FreqSKEInteraction(Writer writer, String baseURL, String runCGIPath, String corpus, String query,
            String fcritQuery, int minFreqFilter, int exampleNumner, boolean writeParsed, boolean appendWritingMode)
            throws Exception {
        super(baseURL, corpus, query);
        requireParsedWhenAppending(appendWritingMode, writeParsed);
        this.writer = writer;
        this.fileOutput = null;
        this.runCGIPath = runCGIPath;
        this.maxPageSize = DEFAULT_MAX_MAX_PAGE_SIZE;
        this.fcritQuery = fcritQuery;
        this.exampleNumber = exampleNumner;
        this.minFreqFilter = minFreqFilter;
        this.writeParsed = writeParsed;
        this.appendWritingMode = appendWritingMode;
    }

    /**
     * Creates an interaction that writes its result to its own file.
     *
     * @param file destination file, opened (UTF-8) when the query is run
     * @param rootURL passed to the base class
     * @param runCGIPath path prefix placed before {@code /freqs?} in the request
     * @param corpus passed to the base class; sent as {@code corpname}
     * @param query passed to the base class; sent as {@code q} (prefixed with "q")
     * @param fcritQuery value of the {@code fcrit} request parameter
     * @param minFreqFilter value of the {@code flimit} request parameter
     * @param exampleNumner value of the {@code examples} request parameter
     * @param writeParsed {@code true} to write parsed lines, {@code false} to
     * write raw JSON
     * @param appendWritingMode {@code true} to append to {@code file} rather
     * than overwrite it; only allowed together with {@code writeParsed}
     * @throws IllegalArgumentException if {@code appendWritingMode} is set
     * without {@code writeParsed}
     * @throws Exception declared for backward compatibility
     */
    public FreqSKEInteraction(
            File file, String rootURL, String runCGIPath, String corpus, String query, String fcritQuery,
            int minFreqFilter, int exampleNumner, boolean writeParsed, boolean appendWritingMode) throws Exception {
        super(rootURL, corpus, query);
        requireParsedWhenAppending(appendWritingMode, writeParsed);
        this.writer = null;
        this.fileOutput = file;
        this.runCGIPath = runCGIPath;
        this.maxPageSize = DEFAULT_MAX_MAX_PAGE_SIZE;
        this.fcritQuery = fcritQuery;
        this.exampleNumber = exampleNumner;
        this.minFreqFilter = minFreqFilter;
        this.writeParsed = writeParsed;
        this.appendWritingMode = appendWritingMode;
    }

    /**
     * Placeholder for a mode that hands results to a parser object instead of
     * writing them out. Not implemented: this constructor always throws.
     *
     * @param maxPageSize unused for now
     * @param parser unused for now
     * @param rootURL passed to the base class
     * @param corpus passed to the base class
     * @param query passed to the base class
     * @param fcritQuery unused for now
     * @param minFreqFilter unused for now
     * @param exampleNumner unused for now
     * @throws UnsupportedOperationException always
     */
    public FreqSKEInteraction(int maxPageSize, SkeJsonResultParser parser, String rootURL, String corpus,
            String query, String fcritQuery, int minFreqFilter, int exampleNumner) {
        super(rootURL, corpus, query);
        // The constructor never completes normally, so the final fields need no value here.
        throw new UnsupportedOperationException("to be implemented in the future");
    }

    /**
     * Rejects the one unsupported flag combination: appending raw JSON.
     *
     * @throws IllegalArgumentException if {@code appendWritingMode} is set
     * without {@code writeParsed}
     */
    private static void requireParsedWhenAppending(boolean appendWritingMode, boolean writeParsed) {
        if (appendWritingMode && !writeParsed) {
            throw new IllegalArgumentException("You can only append parsed data to files ... no json! ");
        }
    }

    /**
     * Builds the request path and query string for one page of the frequency
     * list.
     *
     * @param fromPage page counter maintained by the paging loop (starts at 0)
     * @return {@code runCGIPath} followed by {@code /freqs?} and the parameters
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    private String encodeFreqQuery(int fromPage) throws UnsupportedEncodingException {
        final String utf8 = "UTF-8";
        // NOTE(review): "fpage" is sent twice (fromPage and fromPage + 1). Which
        // one the server honours is not visible from here; kept as-is - confirm
        // against the server API before removing either occurrence.
        return runCGIPath + "/freqs?"
                + "q=q" + URLEncoder.encode(this.query, utf8) + ";"
                + "corpname=" + URLEncoder.encode(this.corpus, utf8) + ";"
                + "fcrit=" + URLEncoder.encode(this.fcritQuery, utf8) + ";"
                + "flimit=" + this.minFreqFilter + ";"
                + "fpage=" + fromPage + ";"
                + "fpage=" + (fromPage + 1) + ";"
                + "examples=" + this.exampleNumber + ";"
                + "&pagesize=" + maxPageSize
                + "&async=" + ASYNC
                + "&format=json";
    }

    /**
     * Fetches every result page and writes it to the caller-supplied writer,
     * then flushes and closes that writer.
     *
     * @throws UnsupportedEncodingException if the query cannot be URL-encoded
     * @throws IOException if writing fails
     * @throws Exception if the server reports an unexpected error
     */
    private void getFrequencyContext() throws UnsupportedEncodingException, IOException, Exception {
        HttpClient session = super.getSessionID();
        writeAllPages(session, writer);
        // NOTE(review): the writer is injected and may be shared between
        // instances; it is closed here only because the original code did so on
        // success. On failure it is left open for its owner to deal with.
        writer.flush();
        writer.close();
    }

    /**
     * Parsing method; not implemented yet.
     *
     * @return never returns normally
     * @throws Exception always
     */
    private List<WLTGD> getParseFreqList() throws UnsupportedEncodingException, IOException, Exception {
        throw new Exception("to implement");
    }

    /**
     * Runs the query and writes the result to whichever output this instance
     * was constructed with. Failures are logged, not propagated.
     */
    @Override
    public void run() {
        System.out.println("\tQ: " + this.query + " for " + this.fcritQuery);
        try {
            if (this.writer != null) {
                // everything goes through the writer handed in by the caller
                getFrequencyContext();
            } else if (this.fileOutput != null) {
                // this query gets its own output file
                getFrequencyContextSingle();
            } else {
                System.err.println("NOT IMPLEMENT YET");
            }
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Frequency query failed: " + this.query + " for " + this.fcritQuery, ex);
        }
    }

    /**
     * Fetches every result page and writes it to this instance's own output
     * file (UTF-8). The file is appended to when append mode was requested and
     * overwritten otherwise; it is always closed, even when fetching fails.
     *
     * @throws IllegalStateException if this instance has no output file
     * @throws UnsupportedEncodingException if the query cannot be URL-encoded
     * @throws IOException if the file cannot be opened or written
     * @throws Exception if the server reports an unexpected error
     */
    public void getFrequencyContextSingle() throws UnsupportedEncodingException, IOException, Exception {
        if (this.fileOutput == null) {
            throw new IllegalStateException("No output file configured; use the File-based constructor");
        }
        // Obtain the session before touching the file so that a failure here
        // does not create or truncate the output.
        HttpClient session = super.getSessionID();
        try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(this.fileOutput, this.appendWritingMode), StandardCharsets.UTF_8))) {
            writeAllPages(session, out);
            out.flush();
        }
    }

    /**
     * Requests result pages one after another and writes each of them to
     * {@code out}, either as raw JSON wrapped in a {@code {"results": [...]}}
     * envelope or as parsed frequency lines. Does not close {@code out}.
     *
     * @param session HTTP client used for all page requests
     * @param out destination of the output
     * @throws IOException if writing fails
     * @throws Exception if the server answers with an error other than
     * "Empty list"
     */
    private void writeAllPages(HttpClient session, Writer out) throws IOException, Exception {
        if (!writeParsed) {
            out.append("{\"results\": [\n");
        }
        int page = 0;
        while (true) {
            JSONObject response = super.getHTTP(session, encodeFreqQuery(page));
            if (writeParsed) {
                writeParsedLines(response, out);
            } else {
                out.append(response.toString(1));
            }

            if (response.has("error")) {
                String message = String.valueOf(response.get("error"));
                if ("Empty list".equals(message)) {
                    System.out.println("No result for current query: " + this.query);
                    break;
                }
                System.out.println("* NOT SEEN * " + response.toString(1));
                throw new Exception("not seen " + response.toString(1));
            }

            // A response without "lastpage" gives no way to tell whether more
            // pages exist; stop instead of requesting the same page forever.
            if (!response.has("lastpage") || response.getInt("lastpage") != 0) {
                break;
            }
            if (!writeParsed) {
                out.append(",");
            }
            page++;
        }
        if (!writeParsed) {
            out.append("]" + "}"); // closes the JSON envelope
        }
    }

    /**
     * Parses one response page and writes one line per frequency entry.
     *
     * @param response page as returned by the server
     * @param out destination of the lines
     * @throws Exception if parsing or writing fails
     */
    private void writeParsedLines(JSONObject response, Writer out) throws Exception {
        FreqMethodJsonParser pageParser = new FreqMethodJsonParser(response.toString());
        // Hold the writer for the whole page so that lines of one page stay
        // together if other threads write to the same writer.
        synchronized (out) {
            FrequencyLine line;
            while ((line = pageParser.getNext()) != null) {
                out.append(line.toStringLine()).append("\n");
            }
        }
    }

}