Java tutorial: the NewSum server entry point (org.scify.NewSumServer.Server.Utils.Main)
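The listing below is the complete Main class of the NewSum server. It wires the pipeline end to end: parse command-line switches, write a configuration file for the free service, read the RSS source list, fetch and cluster articles, build a Lucene index, and summarise each cluster, with an optional interactive debug mode. Illustrative sketches added for this walkthrough appear as clearly hedged comment blocks.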
/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 *
 *     http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * If this code or its output is used, extended, re-engineered, integrated,
 * or embedded to any extent in another software or hardware, there MUST be
 * an explicit attribution to this work in the resulting source code,
 * the packaging (where such packaging exists), or user interface
 * (where such an interface exists).
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */
package org.scify.NewSumServer.Server.Utils;

// Switch for English sources:
// -PathToSources=./data/Sources/v1.0.RSSSourcesEN.txt

import gr.demokritos.iit.conceptualIndex.structs.Distribution;
import gr.demokritos.iit.jinsect.storage.INSECTDB;
import gr.demokritos.iit.jinsect.storage.INSECTFileDB;
import gr.demokritos.iit.jinsect.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import java.util.logging.*;
import java.util.regex.Pattern;

import org.apache.commons.feedparser.FeedParserException;
import org.apache.commons.feedparser.network.NetworkException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.LockObtainFailedException;

import org.scify.NewSumServer.Server.Comms.Communicator;
import org.scify.NewSumServer.Server.MachineLearning.classificationModule;
import org.scify.NewSumServer.Server.Searching.Indexer;
import org.scify.NewSumServer.Server.Sources.ISourceParser;
import org.scify.NewSumServer.Server.Sources.RSSSources;
import org.scify.NewSumServer.Server.Sources.RssParser;
import org.scify.NewSumServer.Server.Storage.IDataStorage;
import org.scify.NewSumServer.Server.Storage.InsectFileIO;
import org.scify.NewSumServer.Server.Structures.Article;
import org.scify.NewSumServer.Server.Structures.Sentence;
import org.scify.NewSumServer.Server.Structures.Topic;
import org.scify.NewSumServer.Server.Summarisation.ArticleClusterer;
import org.scify.NewSumServer.Server.Summarisation.RedundancyRemover;
import org.scify.NewSumServer.Server.Summarisation.Summariser;

import static org.scify.NewSumServer.Server.Utils.Main.UserDir;

/**
 * @author George K. <gkiom@scify.org>
 */
public class Main {

    @SuppressWarnings("NonConstantLogger")
    static Logger LOGGER = Logger.getLogger(Main.class.getName());

    static final String CustomUserDir = System.getProperty("user.dir");
    static final String UserDir = ".";
    static final String fileSep = System.getProperty("file.separator");

    /**
     * The path to the log file
     */
    static String sLogFile = UserDir + fileSep + "data" + fileSep
            + "Logger" + fileSep + "NewSumServerLog.txt";
    /**
     * The folder where the FileINSECTDB saves
     */
    public static String sBaseDir = UserDir + fileSep + "data" + fileSep
            + "BaseDir" + fileSep;
    /**
     * The file containing the RSS sources
     */
    public static String sPathToSources = UserDir + fileSep + "data" + fileSep
            + "Sources" + fileSep + "v1.0.RSSSourcesGR.txt";
    /**
     * The folder where the Indexer class saves its data
     */
    public static String sindexPath = UserDir + fileSep + "data" + fileSep
            + "Indexed" + fileSep;
    /**
     * The folder where the Summariser class stores its summaries
     */
    public static String sSummaryPath = UserDir + fileSep + "data" + fileSep
            + "Summaries" + fileSep;
    /**
     * The folder where the Clusterer saves the articles
     */
    public static String sArticlePath = UserDir + fileSep + "data" + fileSep
            + "Articles" + fileSep;
    /**
     * Folder for misc tool files
     */
    public static String sToolPath = UserDir + fileSep + "data" + fileSep
            + "Tools" + fileSep;
    /**
     * The maximum number of sentences returned by the summariser
     */
    public static int iOutputSize = 10;
    /**
     * Default value for the accepted article age, in days, counted back from
     * now. If an article is older than this number of days, it is not
     * accepted. Set it on the command line with 'ArticleMaxDays'.
     */
    public static long iArticleDays = 5L;
    /**
     * The path where the classification module stores data
     */
    public static String sClassModPath = UserDir + fileSep + "ClassificationServerModule/";
    public static boolean bUseInputDirData = false;
    /**
     * True if a run on a single category must be applied. Defaults to false
     * (all categories loaded).
     */
    public static boolean bDebugRun = false;
    /**
     * Path to the file that holds the data ([Category-days], one per line),
     * used by the Utilities class to save
     */
    public static final String sPathToCatsPerDaysFile = UserDir + fileSep + "data"
            + fileSep + "Sources" + fileSep + "DaysPerCategory.txt";
    // should not be used in the final version, only for testing
    public static String sSep = " *** ";
    /**
     * The plain text summary storage folder (debug)
     */
    protected static String sTxtSumPath = CustomUserDir + fileSep + "data"
            + fileSep + "txtSummaries" + fileSep;

    public static Integer threshold;
    protected static classificationModule clm;
    private static double NVSThreshold = 0, SSThreshold = 0;
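    /*
     * For orientation, the defaults above imply an on-disk layout like the
     * sketch below (relative to the working directory). This is illustrative,
     * derived only from the path fields in this class:
     *
     *   ./data/Logger/NewSumServerLog.txt      (sLogFile)
     *   ./data/BaseDir/                        (sBaseDir, storage module)
     *   ./data/Sources/v1.0.RSSSourcesGR.txt   (sPathToSources)
     *   ./data/Sources/DaysPerCategory.txt     (sPathToCatsPerDaysFile)
     *   ./data/Indexed/                        (sindexPath)
     *   ./data/Summaries/                      (sSummaryPath)
     *   ./data/Articles/                       (sArticlePath)
     *   ./data/Tools/                          (sToolPath)
     *   ./ClassificationServerModule/          (sClassModPath)
     *   <user.dir>/data/txtSummaries/          (sTxtSumPath, debug output)
     */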
    // TODO: Currently, the ServerConfig.txt that the Main class creates is located
    // at ./data/BaseDir. So if the user changes this dir, the free service won't work.
    public static void main(String[] args) throws FeedParserException,
            NetworkException, IOException {

        // initialise logger
        Handler h;
        try {
            h = new FileHandler(sLogFile);
            SimpleFormatter f = new SimpleFormatter();
            h.setFormatter(f);
            LOGGER.addHandler(h);
            LOGGER.setLevel(Level.FINE);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        } catch (SecurityException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }

        // program info
        System.out.print("NewSumServer Switches:\n\n"
                + "-BaseDir: The full path to the folder where the storage module stores data\n"
                + "-PathToSources: The file named RSSSources.txt with its full path\n"
                + "\t(e.g. /home/pathtosources/RSSSources.txt)\n"
                + "-indexPath: The full path to the folder where the Indexer Class stores data\n"
                + "-SummaryPath: The full path to the folder where the Summarisation package stores data\n"
                + "-ArticlePath: The full path to the folder where the "
                + "Summarisation package stores the Clustered Articles\n"
                + "-ToolPath: The full path to the folder for misc Tools\n"
                + "-iOutputSize: The max number of Sentences the Summariser prints\n"
                + "-ArticleMaxDays: The Number of Max Days to Accept an article (until now)\n"
                + "-useInputDirData: true or false, defaults to false\n"
                + "-DebugRun: true if you want to run with category switching (for quicker runs)\n\n"
                + "Example Usage: java -jar NewSumServer.jar -BaseDir=./data/Dir -iOutputSize=50\n\n");

        // Parse and check command line arguments
        parseCommandLine(args);
        // Write the configuration file so that NewSumFreeService reads the statics
        writeConfigFile();

        // Check whether the splitter training file changed since the previous run
        if (SplitterTrainingFileChanged()) {
            // If so, delete the model file, so it gets recreated with the new data
            File sDat = new File(sToolPath + "splitModel.dat");
            try {
                sDat.delete();
                LOGGER.log(Level.INFO,
                        "deleted {0} because splitterTrainer was updated", sDat.toString());
            } catch (Exception ex) {
                LOGGER.log(Level.WARNING, "Could not delete {0}, although the file "
                        + "was changed. Please do it manually", sDat.toString());
            }
        }

        // init data storage
        IDataStorage ids = new InsectFileIO(sBaseDir);
        // justWaitABit(50000);

        // init RSS sources
        RSSSources r = new RSSSources(sPathToSources);
        // initialise sources: read the sources file
        r.initialize(ids);
        // get the sources
        HashMap<String, String> Sources = r.getRssLinks(); // link, category
        // get categories
        Collection<String> sCategories = r.getCategories();
        // TODO: Ignore UNCLASSIFIED CATEGORY
        ArrayList<String> lCategories = new ArrayList<String>(sCategories);

        // init RSS parser
        ISourceParser isp = new RssParser(ids, iArticleDays);

        // DEBUG LINES
        // get user input
        List al = new ArrayList(sCategories);
        ArrayList<String> subSources = null;
        ArrayList<Article> Articles = new ArrayList<Article>();
        String sCurCateg = "0";
        if (bDebugRun) { // if only one category is needed (quick run)
            System.out.println("Choose Category by number: \nIf -1, all categories are loaded");
            for (int i = 0; i < lCategories.size(); i++) {
                System.out.println(String.valueOf(i) + ": " + lCategories.get(i));
            }
            Scanner user_input = new Scanner(System.in);
            sCurCateg = user_input.next();
            /////////////
            if (Integer.valueOf(sCurCateg) != -1) {
                subSources = new ArrayList<String>((HashSet<String>) Utilities.getKeysByValue(
                        Sources, (String) al.get(Integer.valueOf(sCurCateg))));
                // accept all articles from the chosen category
                Articles = (ArrayList<Article>) isp.getAllNewsByCategory(
                        subSources, (String) al.get(Integer.valueOf(sCurCateg)));
            } else if (Integer.valueOf(sCurCateg) == -1) {
                // get all articles
                Articles = (ArrayList<Article>) isp.getAllArticles(Sources);
            }
        } else { // all categories by default (no user choice - normal mode)
            Articles = (ArrayList<Article>) isp.getAllArticles(Sources);
        }
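        /*
         * A debug run (-DebugRun=true) is interactive. Illustrative session,
         * with hypothetical category names standing in for whatever the
         * sources file actually defines:
         *
         *   Choose Category by number:
         *   If -1, all categories are loaded
         *   0: Politics
         *   1: Culture
         *   2: World
         *
         * Entering 1 fetches articles from the "Culture" sources only;
         * entering -1 fetches articles from all sources.
         */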
        // check for spam sentences
        Utilities.checkForPossibleSpam(Articles);

        // Save the article list to disk, so that the clusterer loads it
        isp.saveAllArticles(); // Name: "AllArticles", Category: "feeds"

//        ArticleClusterer ac = new ArticleClusterer(subArticles, ids, sArticlePath);

        // get least occurrences of articles
//        threshold = Utilities.getLeastOccurencies(Articles);

        // Train the classification module
//        clm = new classificationModule();
//
//        // initialise the Distribution category set
//        Distribution<String> dArticleCategory = new Distribution<String>();
//
//        for (int i = 0; i < Articles.size(); i++) {
//            if (Articles.get(i).getToWrap()) {
//                boolean mergeGraph = true;
//                // increase Distribution set 1.0
//                dArticleCategory.increaseValue(Articles.get(i).getCategory(), 1.0);
//                // check threshold
//                double dInstanceCount = dArticleCategory.getValue(Articles.get(i).getCategory());
//                if (dInstanceCount < threshold) {
//                    // check mergeGraph threshold --> threshold/2 turns mergeGraph from true to false
//                    if (dInstanceCount > (threshold / 2)) {
//                        mergeGraph = false;
//                    }
//                    clm.feedClassifier(Articles.get(i).getCategory(),
//                            Articles.get(i).getText(), mergeGraph);
//                }
//            }
//        }

        // Initialise the Clusterer
        ArticleClusterer ac;
        ac = new ArticleClusterer(
                (ArrayList<Article>) ids.loadObject("AllArticles", "feeds"),
                ids, sArticlePath);
        // Perform clustering calculations
        ac.calculateClusters(NVSThreshold, SSThreshold);

        // specify the locale for the indexer
        Locale loc = sPathToSources.endsWith("GR.txt")
                ? new Locale("el") : new Locale("en");

        // Create a new indexer
        Indexer ind = new Indexer(sArticlePath, sindexPath, loc);
        // Create the index
        try {
            ind.createIndex();
        } catch (CorruptIndexException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        } catch (LockObtainFailedException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        }

        // Init the summariser and obtain summaries
        INSECTDB idb = new INSECTFileDB("", sSummaryPath);
        Summariser sum = new Summariser(
                new HashSet<Topic>(ac.getArticlesPerCluster().values()), idb);

        // Perform summarisation for all clusters
        Map<String, List<Sentence>> AllSummaries;
        AllSummaries = sum.getSummaries();

        // DEBUG
        // store summaries
//        for (Map.Entry mp : AllSummaries.entrySet()) {
//            String sUID = (String) mp.getKey();
//            List<Sentence> lsSen = (List<Sentence>) mp.getValue();
//            if (getNumberOfSources(lsSen) > 1) {
//                writeSummaryToFile(lsSen, sUID, ac.getArticlesPerCluster());
//            }
//        }
        // DEBUG

        if (bDebugRun) { // DEBUG LINES
            // Delete files in "data/txtSummaries" and write all the summaries extracted
            File f = new File(sTxtSumPath);
            if (f.exists()) {
                for (File k : f.listFiles()) {
                    k.delete();
                }
            }
            for (Map.Entry mp : AllSummaries.entrySet()) {
                String sUID = (String) mp.getKey();
                List<Sentence> lsSen = (List<Sentence>) mp.getValue();
                if (getNumberOfSources(lsSen) > 1) {
                    writeSummaryToFile(lsSen, sUID, ac.getArticlesPerCluster());
                }
            }
            // debug communicator
            Communicator cm = new Communicator(ids, ac, sum, ind);
            int bb = Integer.valueOf(sCurCateg);
            if (bb == -1) {
                bb = 0;
            }
            // print summaries from the communicator
            int iSummarizedClusterCnt = 0;
            for (Topic tTopic : ac.getArticlesPerCluster().values()) {
                if (tTopic.size() > 1) {
                    System.out.println("Printing summary for topic: " + tTopic.getTitle());
                    String[] eachSnippet = cm.getSummary(tTopic.getID(), "All")
                            .split(cm.getFirstLevelSeparator());
                    int iAllTmpSourcesCount =
                            eachSnippet[0].split(cm.getSecondLevelSeparator()).length;
                    System.out.println("With Summary Sources: " + iAllTmpSourcesCount);
                    for (int i = 1; i < eachSnippet.length; i++) {
                        String[] eachSent = eachSnippet[i].split(cm.getSecondLevelSeparator());
                        System.out.println(eachSent[0]);
                        System.out.println("-----------------------------------");
                        iSummarizedClusterCnt++; // NOTE: counts printed sentences, not clusters
                    }
                    System.out.println("===========================");
                }
            }
        }
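        /*
         * Shape of the summary string handled above, as implied by the
         * splitting logic (the separator tokens are whatever Communicator
         * defines; shown symbolically here, not literally):
         *
         *   <src1><2nd-sep><src2>...<1st-sep><sentence><2nd-sep>...<1st-sep>...
         *
         * That is, the first first-level field lists the summary's sources,
         * and each following first-level field begins with one summary
         * sentence.
         */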
        // DEBUG LINES
        // get user input
//        System.out.println("Enter Search String\n");
//        Scanner imp = new Scanner(System.in);
//        String term = imp.next();
//
//        String sTop = cm.getTopicIDsByKeyword(ind, term, "All");
//        System.out.println(sTop);
//        System.out.println(cm.getTopicTitlesByIDs(sTop));

        // last debug
//        String sUserSources = "http://rss.in.gr/feed/news/culture/" +
//                "http://www.tovima.gr/feed/culture/" +
//                "http://www.naftemporiki.gr/rssFeed?mode=section&id=6&atype=story";
//        System.out.println(cm.getTopics(sUserSources, (String) al.get(bb)));
//        System.out.println(cm.getTopics("All", (String) al.get(bb)));
        // last debug

//        System.out.println("Found a total of " + iSummarizedClusterCnt + " summaries"
//                + " from more than one texts.");
//        System.out.println(cm.getTopicIDs("All", (String) al.get(bb)));
//        System.out.println("===============printing topic titles");
//        System.out.println(cm.getTopicTitles("All", (String) al.get(bb)));
//        System.out.println("===============ending printing topic titles");

//        String sUserSources = "http://rss.in.gr/feed/news/world/;;;"
//                + "http://www.naftemporiki.gr/news/static/rss/news_pol_pol-world.xml;;;"
//                + "http://ws.kathimerini.gr/xml_files/worldnews.xml;;;"
//                + "http://feeds.feedburner.com/skai/aqOL?format=xml";
//        System.out.println(cm.getTopicIDs(sUserSources, (String) al.get(bb)));
//        System.out.println(cm.getTopicTitles(sUserSources, (String) al.get(bb)));
//        int counter = 0;
//        for (Topic tTopic : ac.getArticlesPerCluster().values()) {
//            System.out.println("====================");
//            System.out.println(cm.getSummary(tTopic.getID(), sUserSources));
//            counter++;
//            if (counter == 3) {
//                break;
//            }
//        }
    }

    private static void parseCommandLine(String[] args) {
        // Parse command line switches
        Hashtable hSwitches = utils.parseCommandLineSwitches(args);
        sPathToSources = utils.getSwitch(hSwitches, "PathToSources", sPathToSources);
        sBaseDir = addSuffix(utils.getSwitch(hSwitches, "BaseDir", sBaseDir));
        sindexPath = addSuffix(utils.getSwitch(hSwitches, "indexPath", sindexPath));
        sSummaryPath = addSuffix(utils.getSwitch(hSwitches, "SummaryPath", sSummaryPath));
        sArticlePath = addSuffix(utils.getSwitch(hSwitches, "ArticlePath", sArticlePath));
        sToolPath = addSuffix(utils.getSwitch(hSwitches, "ToolPath", sToolPath));
        // NOTE: the usage banner advertises "-iOutputSize", but the switch
        // actually parsed here is named "outputSize"
        iOutputSize = Integer.valueOf(utils.getSwitch(
                hSwitches, "outputSize", String.valueOf(iOutputSize))).intValue();
        iArticleDays = Integer.valueOf(utils.getSwitch(
                hSwitches, "ArticleMaxDays", String.valueOf(iArticleDays))).intValue();
        bUseInputDirData = Boolean.valueOf(utils.getSwitch(
                hSwitches, "useInputDirData", Boolean.FALSE.toString()));
        bDebugRun = Boolean.valueOf(utils.getSwitch(
                hSwitches, "DebugRun", Boolean.FALSE.toString()));
        NVSThreshold = Double.valueOf(utils.getSwitch(hSwitches, "NVSThreshold", "0.20"));
        SSThreshold = Double.valueOf(utils.getSwitch(hSwitches, "SSThreshold", "0.10"));
        // check user input
        checkPaths(hSwitches.values().toArray()); // Check switches
    }
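    /*
     * Putting the switches together, a typical English-sources run might be
     * launched as below. The paths are illustrative; the -PathToSources value
     * is the one suggested in the comment at the top of this file:
     *
     *   java -jar NewSumServer.jar \
     *       -PathToSources=./data/Sources/v1.0.RSSSourcesEN.txt \
     *       -BaseDir=./data/BaseDir \
     *       -DebugRun=true -NVSThreshold=0.20 -SSThreshold=0.10
     */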
    private static String addSuffix(String in) {
        if (!in.endsWith(fileSep)) {
            in += fileSep;
        }
        return in;
    }

    private static void checkPaths(Object[] args) {
        Iterator iIter = Arrays.asList(args).iterator();
        while (iIter.hasNext()) {
            String sCurSwitch = (String) iIter.next();
            // only directory-like switch values are checked here:
            // skip .txt files, booleans and numeric thresholds
            if (!sCurSwitch.endsWith(".txt")
                    && !sCurSwitch.equals("true")
                    && !sCurSwitch.equals("false")
                    && !Pattern.matches("[0-9]+\\.*[0-9]*", sCurSwitch)) {
                File fsw = new File(sCurSwitch);
                if (!fsw.isDirectory()) {
                    LOGGER.log(Level.WARNING, "Error: {0} is not a directory", fsw);
                    LOGGER.log(Level.INFO, "Trying to create Dir...");
                    boolean happy = (new File(sCurSwitch)).mkdir();
                    if (happy) {
                        LOGGER.log(Level.INFO, "Directory {0} created", sCurSwitch);
                    } else {
                        LOGGER.log(Level.WARNING, "Could not create Dir...");
                    }
                }
            }
        }
        // delete the categories/days-to-keep file if it exists, so that
        // new data may be appended to it
        File sCatsDays = new File(sPathToCatsPerDaysFile);
        if (sCatsDays.exists()) {
            sCatsDays.delete();
        }
        // check that the sources file can be read, else abort
        File SourcesFile = new File(sPathToSources);
        if (!SourcesFile.exists() || !SourcesFile.canRead()) {
            LOGGER.log(Level.SEVERE, "{0} does not exist\nAborting...", SourcesFile);
            System.exit(0);
        }
        String[] paths = {sBaseDir, sArticlePath, sSummaryPath, sToolPath, sindexPath};
        for (String eachPath : paths) {
            File chkPath = new File(eachPath);
            if (!chkPath.isDirectory() || !chkPath.exists()) {
                LOGGER.log(Level.SEVERE, "{0} is not a directory\nAborting...", chkPath);
                System.exit(0);
            }
        }
    }

    private static boolean SplitterTrainingFileChanged() {
        boolean bChanged = false;
        String sStartsWith = "FileSize";
        String sLocalSep = "=";
        // get the current size of the training file
        File fSentenceTrainer = new File(
                "./src/org/scify/NewSumServer/Server/Summarisation/SentenceSplitterTraining.txt");
        long lCurrentSize = fSentenceTrainer.length();
        // read the file that stores the size recorded on the previous run
        File fFileSize = new File(sToolPath + "FileSize.txt");
        if (fFileSize.exists()) {
            long lOldFileSize;
            // read the file and get the size from the previous run
            if (fFileSize.canRead()) {
                FileInputStream fstream;
                try {
                    fstream = new FileInputStream(fFileSize);
                    DataInputStream in = new DataInputStream(fstream);
                    BufferedReader br = new BufferedReader(new InputStreamReader(in));
                    String sLine;
                    while ((sLine = br.readLine()) != null) {
                        if (sLine.startsWith(sStartsWith)) {
                            // get the previous file size
                            lOldFileSize = Long.valueOf(sLine.split("=")[1].trim());
                            // check whether the values differ
                            bChanged = (lCurrentSize != lOldFileSize);
                            // overwrite the stored value with the current one
                            BufferedWriter bw = new BufferedWriter(new FileWriter(fFileSize, false));
                            bw.append(sStartsWith).append(sLocalSep);
                            bw.append(String.valueOf(lCurrentSize));
                            bw.close();
                        }
                    }
                    in.close();
                } catch (FileNotFoundException ex) {
                    LOGGER.log(Level.SEVERE, "File Not Found (Exception): {0}", fFileSize.toString());
                } catch (IOException ex) {
                    LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
                }
            } else {
                LOGGER.log(Level.SEVERE, "Error: Cannot read from file: {0}", fFileSize.toString());
            }
        } else {
            try {
                // create it for the first time
                fFileSize.createNewFile();
                BufferedWriter bw = new BufferedWriter(new FileWriter(fFileSize, false));
                // write the initial value
                bw.append(sStartsWith).append(sLocalSep);
                bw.append(String.valueOf(lCurrentSize));
                bw.close();
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, "Could not create file {0}", fFileSize.toString());
            }
        }
        return bChanged;
    }
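    /*
     * The bookkeeping file written above holds a single key=value line, e.g.
     * (the size value is illustrative):
     *
     *   FileSize=123456
     */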
switches.put("SSThreshold", String.valueOf(SSThreshold)); // switches.put("SplitterTraining", String.valueOf(lFileSize)); //write Config File, so that FreeService reads values from it File fConfig = new File(sBaseDir + "ServerConfig.txt"); if (fConfig.exists()) { fConfig.delete(); } try { fConfig.createNewFile(); } catch (IOException ex) { LOGGER.log(Level.SEVERE, ex.getMessage(), ex); } if (fConfig.canWrite()) { try { BufferedWriter bw = null; bw = new BufferedWriter(new FileWriter(fConfig, true)); Iterator Iter = switches.entrySet().iterator(); while (Iter.hasNext()) { Map.Entry tmpS = (Map.Entry) Iter.next(); bw.append((String) tmpS.getKey() + "=" + (String) tmpS.getValue()); bw.append("\n"); } bw.close(); } catch (IOException ex) { LOGGER.log(Level.SEVERE, ex.getMessage(), ex); } } else { try { throw new IOException("Cannot write to file " + fConfig.getName()); } catch (IOException ex) { LOGGER.log(Level.SEVERE, ex.getMessage(), ex); } } } /** * @return The Logger that is used */ public static Logger getLogger() { return LOGGER; } /** * * @return The full path where the sources file is stored */ public static String getPathToSources() { return sPathToSources; } ///DEBUGGING function private static void writeSummaryToFile(List<Sentence> lsSen, String sCluster, HashMap<String, Topic> hsTopics) throws IOException { File f = new File(sTxtSumPath); if (!f.exists()) { System.err.println("FILE " + sTxtSumPath + " DOES NOT EXIST"); if (!f.mkdirs()) { System.err.println("FILE " + sTxtSumPath + " Could not be created"); } } if (f.isDirectory()) { f.setWritable(true); } String sFullFileName = sTxtSumPath + sCluster + ".txt"; File fFile = new File(sFullFileName); fFile.createNewFile(); BufferedWriter bw = new BufferedWriter(new FileWriter(fFile, false)); bw.write("ClusterID" + sSep + sCluster); bw.newLine(); bw.write("Title: " + hsTopics.get(sCluster).getTitle()); bw.write(("\n========================================\n")); StringBuilder sb = new StringBuilder(); ListIterator<Sentence> li = lsSen.listIterator(); while (li.hasNext()) { Sentence sCur = li.next(); if (sCur.getSnippet().split("[;,. ]").length < 5) { li.remove(); } } lsSen = new RedundancyRemover().removeRedundantSentences(lsSen); for (Sentence each : lsSen) { sb.append(each.getSnippet()); sb.append("\n========================================\n"); // sb.append(each.getLinkToSource()); // sb.append("\n"); // sb.append(each.getFeed()); // sb.append("\n"); } bw.write(sb.toString()); bw.close(); } private static int getNumberOfSources(List<Sentence> lsSen) { HashSet<String> hsSources = new HashSet<String>(); for (Sentence each : lsSen) { hsSources.add(each.getLinkToSource()); } return hsSources.size(); } /** * * @return The classification module instance */ public static classificationModule getClassificationModule() { return clm; } private static void justWaitABit(int seconds) { long l = seconds * 1000; try { Thread.sleep(l); } catch (InterruptedException ex) { Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); } } }