Java tutorial
///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2012 Assaf Urieli
//
//This file is part of Jochre.
//
//Jochre is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Jochre is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Jochre. If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.jochre.lexicon;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.Writer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Constructs a lexicon from a tab-separated text file resource,
 * organised as:
 * word tab frequency.
 * If there is no tab and frequency, the word will be assumed to have a frequency of 1.
 * Lines starting with a # will be ignored, as are lines whose word column is empty.
 *
 * @author Assaf Urieli
 */
public class TextFileLexicon implements Lexicon, Serializable {
	private static final long serialVersionUID = 1278484873657866572L;
	private static final Log LOG = LogFactory.getLog(TextFileLexicon.class);

	/** Word to frequency. Part of the serialized form, so the field name must not change. */
	private Map<String, Integer> entries = new HashMap<String, Integer>();

	/**
	 * Creates an empty lexicon.
	 */
	public TextFileLexicon() {
	}

	/**
	 * Creates a lexicon backed directly by the supplied map (no copy is made).
	 *
	 * @param entries word-to-frequency map used as this lexicon's storage
	 */
	public TextFileLexicon(Map<String, Integer> entries) {
		this.entries = entries;
	}

	/**
	 * Reads a lexicon from a text file using the platform default charset.
	 *
	 * @param textFile the tab-separated lexicon file
	 * @throws RuntimeException wrapping any {@link IOException} raised while reading
	 */
	public TextFileLexicon(File textFile) {
		this(textFile, null);
	}

	/**
	 * Reads a lexicon from a text file.
	 *
	 * @param textFile the tab-separated lexicon file
	 * @param charset name of the charset used to decode the file, or
	 *        <code>null</code> to use the platform default charset
	 * @throws RuntimeException wrapping any {@link IOException} raised while
	 *         opening or reading the file (including an unsupported charset name)
	 * @throws NumberFormatException if a frequency column is not a valid integer
	 */
	public TextFileLexicon(File textFile, String charset) {
		FileInputStream fis = null;
		Scanner scanner = null;
		try {
			fis = new FileInputStream(textFile);
			// InputStreamReader rejects a null charset name with a
			// NullPointerException, so fall back to the default-charset constructor.
			InputStreamReader reader;
			if (charset == null) {
				reader = new InputStreamReader(fis);
			} else {
				reader = new InputStreamReader(fis, charset);
			}
			scanner = new Scanner(new BufferedReader(reader));

			while (scanner.hasNextLine()) {
				String line = scanner.nextLine();
				if (line.startsWith("#")) {
					continue;
				}
				String[] parts = line.split("\t");
				// Blank lines split to a single empty string; a line made only of
				// tabs splits to nothing. Neither describes a word.
				if (parts.length == 0 || parts[0].length() == 0) {
					continue;
				}
				int frequency = 1;
				if (parts.length > 1) {
					frequency = Integer.parseInt(parts[1].trim());
				}
				entries.put(parts[0], frequency);
			}

			// Scanner swallows read errors and simply reports "no more lines";
			// surface them so a truncated lexicon is not silently accepted.
			IOException readError = scanner.ioException();
			if (readError != null) {
				throw readError;
			}
		} catch (IOException e) {
			throw new RuntimeException(e);
		} finally {
			if (scanner != null) {
				scanner.close();
			}
			// Covers the case where the reader could not be created around fis.
			closeQuietly(fis);
		}
	}

	/**
	 * Writes every entry as <code>word TAB frequency NEWLINE</code>. The writer
	 * is neither flushed nor closed.
	 *
	 * @param writer destination for the entries
	 * @throws RuntimeException wrapping any {@link IOException} raised by the writer
	 */
	public void writeFile(Writer writer) {
		try {
			for (Entry<String, Integer> entry : entries.entrySet()) {
				writer.write(entry.getKey() + "\t" + entry.getValue() + "\n");
			}
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Adds one to the frequency of a word, registering it with a frequency of 1
	 * if it is not yet known.
	 *
	 * @param word the word to increment
	 */
	public void incrementEntry(String word) {
		Integer freqObj = entries.get(word);
		if (freqObj == null) {
			entries.put(word, 1);
		} else {
			entries.put(word, freqObj.intValue() + 1);
		}
	}

	/**
	 * Sets the frequency of a word, replacing any previous value.
	 *
	 * @param word the word to set
	 * @param frequency the new frequency
	 */
	public void setEntry(String word, int frequency) {
		entries.put(word, frequency);
	}

	/**
	 * Returns the frequency recorded for a word, or 0 if the word is unknown.
	 */
	@Override
	public int getFrequency(String word) {
		Integer freqObj = entries.get(word);
		if (freqObj != null) {
			return freqObj.intValue();
		}
		return 0;
	}

	/**
	 * Writes this lexicon to a file using Java serialization. If the file name
	 * ends with <code>.zip</code>, the object is stored in a zip archive under
	 * the entry name <code>lexicon.obj</code>.
	 *
	 * @param memoryBaseFile the file to write
	 * @throws RuntimeException wrapping any {@link IOException} raised while writing
	 */
	public void serialize(File memoryBaseFile) {
		LOG.debug("serialize");
		boolean isZip = memoryBaseFile.getName().endsWith(".zip");

		FileOutputStream fos = null;
		ZipOutputStream zos = null;
		ObjectOutputStream out = null;
		try {
			fos = new FileOutputStream(memoryBaseFile);
			if (isZip) {
				zos = new ZipOutputStream(fos);
				zos.putNextEntry(new ZipEntry("lexicon.obj"));
				out = new ObjectOutputStream(zos);
			} else {
				out = new ObjectOutputStream(fos);
			}
			out.writeObject(this);
			out.flush();
			// Close on the success path inside the try so that a failure to
			// finish the file is reported rather than swallowed.
			out.close();
		} catch (IOException ioe) {
			throw new RuntimeException(ioe);
		} finally {
			// No-ops after a successful close; on failure they release whatever
			// was opened, innermost wrapper first.
			closeQuietly(out);
			closeQuietly(zos);
			closeQuietly(fos);
		}
	}

	/**
	 * Reads a serialized lexicon from the next entry of a zip stream. On
	 * success the stream is closed; if an exception is thrown the stream is
	 * left open for the caller to close.
	 * <p>
	 * This uses Java native deserialization and should only be given data from
	 * a trusted source.
	 *
	 * @param zis zip stream positioned before the entry holding the lexicon
	 * @return the deserialized lexicon
	 * @throws RuntimeException if the stream has no further entry, or wrapping
	 *         any {@link IOException} or {@link ClassNotFoundException}
	 */
	public static TextFileLexicon deserialize(ZipInputStream zis) {
		TextFileLexicon memoryBase = null;
		try {
			ZipEntry zipEntry = zis.getNextEntry();
			if (zipEntry == null) {
				throw new RuntimeException("No zip entry in input stream");
			}
			LOG.debug("Scanning zip entry " + zipEntry.getName());

			ObjectInputStream in = new ObjectInputStream(zis);
			memoryBase = (TextFileLexicon) in.readObject();
			zis.closeEntry();
			in.close();
		} catch (IOException ioe) {
			throw new RuntimeException(ioe);
		} catch (ClassNotFoundException cnfe) {
			throw new RuntimeException(cnfe);
		}
		return memoryBase;
	}

	/**
	 * Reads a serialized lexicon from a file written by {@link #serialize(File)}.
	 * A file whose name ends with <code>.zip</code> is read as a zip archive,
	 * otherwise as a raw serialized object.
	 * <p>
	 * This uses Java native deserialization and should only be given files from
	 * a trusted source.
	 *
	 * @param memoryBaseFile the file to read
	 * @return the deserialized lexicon
	 * @throws RuntimeException wrapping any {@link IOException} or
	 *         {@link ClassNotFoundException}
	 */
	public static TextFileLexicon deserialize(File memoryBaseFile) {
		LOG.debug("deserializeMemoryBase");
		boolean isZip = memoryBaseFile.getName().endsWith(".zip");

		FileInputStream fis = null;
		ZipInputStream zis = null;
		ObjectInputStream in = null;
		try {
			fis = new FileInputStream(memoryBaseFile);
			if (isZip) {
				zis = new ZipInputStream(fis);
				return TextFileLexicon.deserialize(zis);
			}
			in = new ObjectInputStream(fis);
			return (TextFileLexicon) in.readObject();
		} catch (IOException ioe) {
			throw new RuntimeException(ioe);
		} catch (ClassNotFoundException cnfe) {
			throw new RuntimeException(cnfe);
		} finally {
			// Release the file handle on both the success and the failure path.
			closeQuietly(in);
			closeQuietly(zis);
			closeQuietly(fis);
		}
	}

	/**
	 * Returns an iterator over the words of this lexicon, backed by the key set
	 * of the underlying map.
	 */
	@Override
	public Iterator<String> getWords() {
		return entries.keySet().iterator();
	}

	/**
	 * Closes a resource during cleanup, logging rather than propagating any
	 * failure so that it cannot mask an exception already in flight.
	 */
	private static void closeQuietly(Closeable closeable) {
		if (closeable == null) {
			return;
		}
		try {
			closeable.close();
		} catch (IOException e) {
			LOG.debug("Failed to close stream", e);
		}
	}
}