Java tutorial
/* * Copyright (C) 2016 Behrang QasemiZadeh <zadeh at phil.hhu.de> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package ie.pars.nlp.sketchengine.interactions; import ie.pars.noske.json.parsers.WordlistMethodJsonParser; import ie.pars.noske.parse.obj.FrequencyLine; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.httpclient.HttpClient; import org.json.JSONObject; /** * * * @author Behrang QasemiZadeh */ public class WordlistSKEInteraction extends SKEInteractionsBase { private final int async = 0; private int countItemFetched; private double totalFreq; private final String runCGIPath; private final boolean writeParsed; final private File fileOutput; private final int MAX_DISPLAYED = 5000000; final private String wlnums; //type of frequencyt final private int wlminfreq; public WordlistSKEInteraction(File file, String rootURL, String runCGIPath, String corpus, String query, String wlnums, int wlminfreq) throws Exception { super(rootURL, corpus, query); this.fileOutput = file; this.runCGIPath = runCGIPath; if (wlnums.equals("frq") || wlnums.equals("docf") || wlnums.equals("arf")) { this.wlnums = wlnums; } else { throw new Exception("This type of frequency is not supported"); } totalFreq = 0.0; countItemFetched = 0; this.wlminfreq = wlminfreq; this.writeParsed = true; } private String encodeFreqQuery(int fromPage) throws UnsupportedEncodingException { String wlpageBit = ""; String secondAndAfter = runCGIPath + "/wordlist?" + "keywords=;" + "include_nonwords=1;" + "wlattr=" + this.query + ";" + "corpname=" + URLEncoder.encode(this.corpus, "UTF-8") + ";" + "wlnums=" + wlnums + ";" //only one of the values of frq, docf, arf + "wlminfreq=" + wlminfreq + ";" + "wlmaxfreq=0;" + "wlmaxitems=" + MAX_DISPLAYED + ";" //{"q": "%5Blemma%3D%3D%22most%22%5D", "freq": 116, "str": "most"} + "wlpage=" + fromPage + ";" + "wlsort=f;" //sort by frequency + "wlpat=.*" + "&async=" + async + "&format=json"; // System.out.println("Request frequency list of all iterms: " + URLDecoder.decode(secondAndAfter, "UTF-8")); return secondAndAfter; } @Override public void run() { System.out.println("\tQ: " + this.query); if (this.fileOutput != null) { try { getItemFrequencyList(); } catch (Exception ex) { System.err.println(ex); Logger.getLogger(WordlistSKEInteraction.class.getName()).log(Level.SEVERE, null, ex); } } else { try { // this.getFrequencyContext(); System.err.println("NOT IMPLEMENT YET"); } catch (Exception ex) { System.err.println(ex); //Logger.getLogger(FreqSKEInteraction.class.getName()).log(Level.SEVERE, null, ex); } } } /** * Get all the results and dump it into files this method can be changed so * Here the difference is that all the res are written into one * * The same parser as freq list can be used here too * * @throws UnsupportedEncodingException * @throws IOExceptionsket * @throws Exception */ public void getItemFrequencyList() throws UnsupportedEncodingException, IOException, Exception { HttpClient sessionID = super.getSessionID(); BufferedWriter writer;// this.writer; OutputStreamWriter outputStreamWriter = new OutputStreamWriter(new FileOutputStream(this.fileOutput), StandardCharsets.UTF_8); writer = new BufferedWriter(outputStreamWriter); if (!writeParsed) { writer.append("{\"results\": [\n"); } int pageNumer = 1; while (true) { JSONObject jsonObjP = super.getHTTP(sessionID, encodeFreqQuery(pageNumer)); //System.out.println(jsonObjP); if (!writeParsed) { writer.append(jsonObjP.toString(1)); } else { WordlistMethodJsonParser fjpm = new WordlistMethodJsonParser(jsonObjP.toString()); FrequencyLine fl; while ((fl = fjpm.getNext()) != null) { totalFreq += fl.getFreq(); countItemFetched++; writer.append(fl.toStringLine()).append("\n"); } // writer.flush(); } boolean hasError = jsonObjP.has("error"); if (hasError) { String message = (String) jsonObjP.get("error"); if ("Empty list".equals(message)) { System.out.println("No result for current query: " + this.query); // retrun null etc break; } else { System.out.println("* NOT SEEN * " + jsonObjP.toString(1)); throw new Exception("not seen " + jsonObjP.toString(1)); } } else { int isLastPage = 0; // int finished = (int) jsonObjP.get("finished"); if (jsonObjP.has("lastpage")) { isLastPage = (int) jsonObjP.get("lastpage"); //System.out.println("** IS Last TO GO " + isLastPage); if (isLastPage == 0) { if (!writeParsed) { writer.append(","); } pageNumer++; } else { //System.out.println("Going to break because last page is not 0"); break; } } } } if (!writeParsed) { writer.append("]" + "}"); // to the end the json file} } writer.flush(); writer.close(); } public int getCountItemFetched() { return countItemFetched; } public double getTotalFreq() { return totalFreq; } }