ie.pars.nlp.sketchengine.interactions.WordlistSKEInteraction.java Source code

Java tutorial

Introduction

Here is the source code for ie.pars.nlp.sketchengine.interactions.WordlistSKEInteraction.java

Source

/* 
 * Copyright (C) 2016 Behrang QasemiZadeh <zadeh at phil.hhu.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package ie.pars.nlp.sketchengine.interactions;

import ie.pars.noske.json.parsers.WordlistMethodJsonParser;
import ie.pars.noske.parse.obj.FrequencyLine;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.HttpClient;
import org.json.JSONObject;

/**
 * Requests a word list (items of one corpus attribute together with a
 * frequency figure) from the {@code wordlist} method of a Sketch Engine style
 * CGI endpoint and writes the result to a file.
 *
 * <p>
 * The attribute whose values are listed is the {@code query} passed to the
 * constructor (it is sent as the {@code wlattr} request parameter). Results are
 * fetched page by page until the server reports the last page.
 *
 * <p>
 * After a run, {@link #getCountItemFetched()} and {@link #getTotalFreq()}
 * report the number of lines written and the sum of their frequencies. Both
 * counters accumulate over the lifetime of the instance; they are not reset
 * between calls to {@link #getItemFrequencyList()}.
 *
 * @author Behrang QasemiZadeh
 */
public class WordlistSKEInteraction extends SKEInteractionsBase {

    /** Value sent as the {@code async} request parameter. */
    private static final int ASYNC = 0;

    /** Value sent as the {@code wlmaxitems} request parameter. */
    private static final int MAX_DISPLAYED = 5000000;

    /** Number of frequency lines written so far (parsed mode only). */
    private int countItemFetched;

    /** Sum of the frequencies of all lines written so far (parsed mode only). */
    private double totalFreq;

    /** Path of the CGI runner; {@code /wordlist?...} is appended to it. */
    private final String runCGIPath;

    /**
     * When {@code true} every response is parsed and written as one line per
     * item; when {@code false} the raw JSON pages are written inside a
     * {@code {"results": [...]}} wrapper. Currently always {@code true}.
     */
    private final boolean writeParsed;

    /** Destination file; {@code null} means "nothing to do" for {@link #run()}. */
    private final File fileOutput;

    /** Type of frequency requested: one of {@code frq}, {@code docf}, {@code arf}. */
    private final String wlnums;

    /** Value sent as the {@code wlminfreq} request parameter. */
    private final int wlminfreq;

    /**
     * Creates an interaction that writes the word list of one attribute to
     * {@code file}.
     *
     * @param file destination file; may be {@code null}, in which case
     * {@link #run()} only reports that this mode is not implemented
     * @param rootURL passed unchanged to the base class
     * @param runCGIPath path of the CGI runner the {@code wordlist} method is
     * appended to
     * @param corpus corpus name; URL-encoded before it is put into the request
     * @param query attribute to list (sent as {@code wlattr}, not encoded here)
     * @param wlnums type of frequency: {@code frq}, {@code docf} or {@code arf}
     * @param wlminfreq minimum frequency sent as {@code wlminfreq}
     * @throws IllegalArgumentException if {@code wlnums} is {@code null} or not
     * one of the supported values
     * @throws Exception declared for compatibility with the base class
     * constructor and existing callers
     */
    public WordlistSKEInteraction(File file, String rootURL, String runCGIPath, String corpus, String query,
            String wlnums, int wlminfreq) throws Exception {
        super(rootURL, corpus, query);

        // Literal-first comparison so that a null wlnums is rejected with the
        // same descriptive error instead of a NullPointerException.
        if (!("frq".equals(wlnums) || "docf".equals(wlnums) || "arf".equals(wlnums))) {
            throw new IllegalArgumentException("This type of frequency is not supported: " + wlnums);
        }
        this.fileOutput = file;
        this.runCGIPath = runCGIPath;
        this.wlnums = wlnums;
        this.wlminfreq = wlminfreq;
        this.writeParsed = true;
        this.totalFreq = 0.0;
        this.countItemFetched = 0;
    }

    /**
     * Builds the request path and query string for one page of the word list.
     *
     * @param fromPage page number sent as {@code wlpage}
     * @return the request string, starting with {@code runCGIPath}
     * @throws UnsupportedEncodingException if UTF-8 is not available for
     * encoding the corpus name
     */
    private String encodeFreqQuery(int fromPage) throws UnsupportedEncodingException {
        return runCGIPath + "/wordlist?"
                + "keywords=;"
                + "include_nonwords=1;"
                + "wlattr=" + this.query + ";"
                + "corpname=" + URLEncoder.encode(this.corpus, "UTF-8") + ";"
                + "wlnums=" + wlnums + ";" // only one of the values frq, docf, arf
                + "wlminfreq=" + wlminfreq + ";"
                + "wlmaxfreq=0;"
                + "wlmaxitems=" + MAX_DISPLAYED + ";"
                + "wlpage=" + fromPage + ";"
                + "wlsort=f;" // sort by frequency
                + "wlpat=.*"
                + "&async=" + ASYNC
                + "&format=json";
    }

    /**
     * Prints the query and, when an output file was given, fetches the word
     * list into it. Failures are reported on standard error and logged; they
     * are not propagated.
     */
    @Override
    public void run() {
        System.out.println("\tQ: " + this.query);

        if (this.fileOutput == null) {
            // No in-memory mode exists yet; only file output is supported.
            System.err.println("NOT IMPLEMENT YET");
            return;
        }
        try {
            getItemFrequencyList();
        } catch (Exception ex) {
            System.err.println(ex);
            Logger.getLogger(WordlistSKEInteraction.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Fetches every page of the word list and writes all of them into the
     * single output file.
     *
     * <p>
     * In parsed mode each item returned by {@link WordlistMethodJsonParser} is
     * written on its own line and added to the running counters. In raw mode
     * the JSON pages are written, comma separated, inside
     * {@code {"results": [ ... ]}}.
     *
     * <p>
     * Paging stops when the response carries the error {@code "Empty list"},
     * when {@code lastpage} is non-zero, or when {@code lastpage} is missing
     * altogether (without it there is no way to tell that another page exists).
     * The output file is always closed, also when an exception is thrown.
     *
     * @throws UnsupportedEncodingException if the request cannot be encoded
     * @throws IOException if the output file cannot be opened or written
     * @throws Exception if the server reports an error other than
     * {@code "Empty list"}, or if the request or the parser fails
     */
    public void getItemFrequencyList() throws UnsupportedEncodingException, IOException, Exception {
        HttpClient sessionID = super.getSessionID();

        // try-with-resources: the file must be flushed and closed even when a
        // request, the parser or the error branch below throws.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(this.fileOutput), StandardCharsets.UTF_8))) {
            if (!writeParsed) {
                writer.append("{\"results\": [\n");
            }

            int pageNumber = 1;
            while (true) {
                JSONObject page = super.getHTTP(sessionID, encodeFreqQuery(pageNumber));

                if (writeParsed) {
                    WordlistMethodJsonParser parser = new WordlistMethodJsonParser(page.toString());
                    FrequencyLine line;
                    while ((line = parser.getNext()) != null) {
                        totalFreq += line.getFreq();
                        countItemFetched++;
                        writer.append(line.toStringLine()).append("\n");
                    }
                } else {
                    writer.append(page.toString(1));
                }

                if (page.has("error")) {
                    // String.valueOf instead of a cast: the error value is not
                    // guaranteed to be a JSON string.
                    String message = String.valueOf(page.get("error"));
                    if ("Empty list".equals(message)) {
                        System.out.println("No result for current query: " + this.query);
                        break;
                    }
                    System.out.println("* NOT SEEN * " + page.toString(1));
                    throw new Exception("not seen " + page.toString(1));
                }

                // A missing "lastpage" used to leave the page number unchanged
                // and re-request the same page forever; treat it as the end.
                if (!page.has("lastpage") || page.getInt("lastpage") != 0) {
                    break;
                }
                if (!writeParsed) {
                    writer.append(",");
                }
                pageNumber++;
            }

            if (!writeParsed) {
                writer.append("]" + "}"); // closes the wrapper opened above
            }
        }
    }

    /**
     * @return number of frequency lines written by this instance so far
     */
    public int getCountItemFetched() {
        return countItemFetched;
    }

    /**
     * @return sum of the frequencies of all lines written by this instance so
     * far
     */
    public double getTotalFreq() {
        return totalFreq;
    }

}