Java tutorial
/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2013 ICM-UW
 *
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */
package pl.edu.icm.coansys.kwdextraction;

import java.io.*;
import java.nio.charset.Charset;
import java.text.BreakIterator;
import java.util.Map.Entry;
import java.util.*;

import org.apache.commons.io.IOUtils;
import org.jdom.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import pl.edu.icm.cermine.DocumentTextExtractor;
import pl.edu.icm.cermine.PdfNLMContentExtractor;
import pl.edu.icm.cermine.PdfRawTextExtractor;
import pl.edu.icm.cermine.exception.AnalysisException;
import pl.edu.icm.coansys.kwdextraction.langident.LanguageIdentifierBean;
import pl.edu.icm.coansys.models.DocumentProtos;
import pl.edu.icm.coansys.models.DocumentProtos.TextWithLanguage;
import pl.edu.icm.coansys.models.constants.ProtoConstants;

/**
 * Implementation of Rapid Automatic Keyword Extraction algorithm
 *
 * @author Artur Czeczko <a.czeczko@icm.edu.pl>
 */
public class RakeExtractor {

    private enum Lang {
        // language code, stopwords path
        PL("pl", "stopwords/stopwords_pl.txt"),
        FR("fr", "stopwords/stopwords_fr.txt"),
        EN("en", "stopwords/stopwords_en.txt");

        private String langCode;
        private String stopwordsPath;

        Lang(String langCode, String stopwordsPath) {
            this.langCode = langCode;
            this.stopwordsPath = stopwordsPath;
        }
    }

    private enum ExtractionOption {
        CONTENT(true, false),
        ABSTRACT(false, true),
        CONTENT_AND_ABSTRACT(true, true);

        private boolean fromContent;
        private boolean fromAbstract;

        ExtractionOption(boolean fromContent, boolean fromAbstract) {
            this.fromContent = fromContent;
            this.fromAbstract = fromAbstract;
        }
    }

    private static final Logger logger = LoggerFactory.getLogger(RakeExtractor.class);
    private static final String ILLEGAL_CHARS = "[^\\p{L}0-9-'\\s]";
    private static final int DEFAULT_KEYWORDS_NUMBER = 8;
    private static final Map<Lang, Set<String>> stopwords;

    private String content;
    private Lang lang;
    private ExtractionOption extractionOption;
    private List<KeywordCandidate> keywordCandidates;
    private Map<String, Map<String, Integer>> cooccurrences;

    static {
        try {
            stopwords = new EnumMap<Lang, Set<String>>(Lang.class);
            for (Lang l : Lang.values()) {
                stopwords.put(l, loadStopwords(l));
            }
        } catch (IOException ex) {
            logger.error("Unable to load stopwords: " + ex);
            throw new IllegalArgumentException(ex);
        }
    }

    /**
     * Every constructor sets this.content (document's content) and calls
     * prepareToExtraction()
     *
     * @param content Document's content as a String
     * @param langCode Document's language (texts in other languages will be
     * ignored)
     * @throws IOException
     */
    public RakeExtractor(String content, String langCode) throws IOException {
        setLang(langCode);
        this.content = filterTextByLang(content, lang.langCode);
        prepareToExtraction();
    }
    /**
     * Every constructor sets this.content (document's content) and calls
     * prepareToExtraction()
     *
     * @param pdfContent Byte array containing a PDF file
     * @param langCode Document's language (texts in other languages will be
     * ignored)
     * @throws AnalysisException
     * @throws IOException
     */
    public RakeExtractor(byte[] pdfContent, String langCode) throws AnalysisException, IOException {
        setLang(langCode);
        content = extractTextFromPdf(pdfContent, this.lang);
        prepareToExtraction();
    }

    /**
     * Every constructor sets this.content (document's content) and calls
     * prepareToExtraction()
     *
     * @param docWrapper Protocol buffers message containing the document
     * @param option specifies which parts of the document are searched while
     * extracting keywords. Possible values: ABSTRACT - only the abstract is
     * processed, CONTENT - only the body of the document is processed,
     * CONTENT_AND_ABSTRACT - both abstract and body are processed.
     * @param langCode Document's language (texts in other languages will be
     * ignored)
     * @throws IOException
     */
    public RakeExtractor(DocumentProtos.DocumentWrapper docWrapper, String option, String langCode) throws IOException {
        setLang(langCode);
        setOption(option);
        StringBuilder sb = new StringBuilder();
        if (extractionOption.fromContent) {
            for (DocumentProtos.Media media : docWrapper.getMediaContainer().getMediaList()) {
                if (media.getMediaType().equals(ProtoConstants.mediaTypePdf)) {
                    try {
                        sb.append(extractTextFromPdf(media.getContent().toByteArray(), this.lang));
                    } catch (Exception ex) {
                        logger.error("Cannot extract text from PDF: " + ex.toString() + " " + media.getSourcePath());
                    }
                } else if (media.getMediaType().equals(ProtoConstants.mediaTypeTxt)) {
                    sb.append(filterTextByLang(media.getContent().toStringUtf8(), lang.langCode));
                }
                sb.append("\n");
            }
        }
        if (extractionOption.fromAbstract) {
            for (TextWithLanguage documentAbstract : docWrapper.getDocumentMetadata().getDocumentAbstractList()) {
                sb.append(filterTextByLang(documentAbstract.getText(), lang.langCode));
            }
        }
        content = sb.toString();
        prepareToExtraction();
    }

    /**
     * Extracts text from a PDF stream.
     *
     * @param pdfContent content of a PDF file
     * @param lang Document's language (texts in other languages will be
     * ignored)
     * @return String object containing the document content
     * @throws IOException
     * @throws AnalysisException
     */
    private String extractTextFromPdf(byte[] pdfContent, Lang lang) throws IOException, AnalysisException {
        String result;
        InputStream pdfStream = new ByteArrayInputStream(pdfContent);
        PdfNLMContentExtractor nextr = new PdfNLMContentExtractor();
        Element contentEl = nextr.extractContent(pdfStream);
        Element bodyEl = contentEl.getChild("body");
        result = bodyEl.getValue();
        if (result == null || result.isEmpty()) {
            pdfStream = new ByteArrayInputStream(pdfContent);
            DocumentTextExtractor<String> extr = new PdfRawTextExtractor();
            result = extr.extractText(pdfStream);
        }
        return filterTextByLang(result, lang.langCode);
    }

    /**
     * Returns the text only if it is in the given language.
     *
     * @param text
     * @param language
     * @return text or an empty String
     * @throws IOException
     */
    private String filterTextByLang(String text, String language) throws IOException {
        LanguageIdentifierBean li = new LanguageIdentifierBean();
        return (language.equals(li.classify(text))) ? text : "";
    }
    /**
     * All steps of keyword extraction. Not to be called before this.content,
     * this.lang and this.extractionOption have been set.
     *
     * @throws IOException
     */
    private void prepareToExtraction() throws IOException {
        extractKeywordCandidates();
        countCooccurrences();
        countMetrics();
    }

    /**
     * Loads stopwords from a file.
     *
     * @param lang Stopwords language
     * @return Set of stopwords
     * @throws IOException
     */
    private static Set<String> loadStopwords(Lang lang) throws IOException {
        Set<String> result = new HashSet<String>();
        InputStream stopwordsStream;
        InputStreamReader isr;
        BufferedReader br = null;
        stopwordsStream = RakeExtractor.class.getClassLoader().getResourceAsStream(lang.stopwordsPath);
        try {
            isr = new InputStreamReader(stopwordsStream, Charset.forName("UTF-8"));
            br = new BufferedReader(isr);
            String stopword = br.readLine();
            while (stopword != null) {
                stopword = stopword.trim();
                if (!stopword.isEmpty()) {
                    result.add(stopword);
                }
                stopword = br.readLine();
            }
        } finally {
            IOUtils.closeQuietly(br);
        }
        return result;
    }

    /**
     * Finds words or word sequences separated by stopwords, punctuation marks
     * etc.
     */
    private void extractKeywordCandidates() {
        Map<String, KeywordCandidate> candidatesMap = new HashMap<String, KeywordCandidate>();
        BreakIterator wordIterator = BreakIterator.getWordInstance();
        wordIterator.setText(content);
        int wordStart = wordIterator.first();
        int candidateStart = wordStart;
        String candidateStr = null;
        KeywordCandidate kwdCand = new KeywordCandidate();

        for (int wordEnd = wordIterator.next(); wordEnd != BreakIterator.DONE;
                wordStart = wordEnd, wordEnd = wordIterator.next()) {
            String word = content.substring(wordStart, wordEnd).trim().toLowerCase();
            String alpha = word.replaceAll(ILLEGAL_CHARS, "");
            if (!word.isEmpty()) {
                if (stopwords.get(lang).contains(word) || word.matches("\\W+") || isNum(word) || !word.equals(alpha)) {
                    candidateStr = content.substring(candidateStart, wordStart);
                } else {
                    kwdCand.addWord(word);
                    if (wordEnd == content.length()) {
                        candidateStr = content.substring(candidateStart, wordEnd);
                    }
                }
                if (candidateStr != null) {
                    candidateStr = candidateStr.trim().toLowerCase().replaceAll(ILLEGAL_CHARS, "").replaceAll("\\s+", " ");
                    if (!candidateStr.isEmpty()) {
                        if (candidatesMap.containsKey(candidateStr)) {
                            candidatesMap.get(candidateStr).incCounter();
                        } else {
                            kwdCand.setKeyword(candidateStr);
                            candidatesMap.put(candidateStr, kwdCand);
                        }
                    }
                    candidateStr = null;
                    candidateStart = wordEnd;
                    kwdCand = new KeywordCandidate();
                }
            }
        }
        keywordCandidates = new ArrayList<KeywordCandidate>();
        for (Entry<String, KeywordCandidate> e : candidatesMap.entrySet()) {
            keywordCandidates.add(e.getValue());
        }
    }

    /**
     * Calculates a matrix of word cooccurrences in keyword candidates.
     */
    private void countCooccurrences() {
        cooccurrences = new HashMap<String, Map<String, Integer>>();
        for (KeywordCandidate cand : keywordCandidates) {
            for (String word : cand.getWords()) {
                Map<String, Integer> submap;
                if (cooccurrences.containsKey(word)) {
                    submap = cooccurrences.get(word);
                } else {
                    submap = new HashMap<String, Integer>();
                    cooccurrences.put(word, submap);
                }
                for (String coword : cand.getWords()) {
                    // each occurrence of the candidate contributes its counter
                    // to the (word, coword) cooccurrence count
                    int count = cand.getCounter();
                    if (submap.containsKey(coword)) {
                        count += submap.get(coword);
                    }
                    submap.put(coword, count);
                }
            }
        }
    }
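    /*
     * Illustrative worked example (not part of the original source): suppose
     * extractKeywordCandidates() produced the candidates "keyword extraction"
     * and "keyword", each seen once. countCooccurrences() then yields
     *   cooccurrences["keyword"]    = {"keyword": 2, "extraction": 1}
     *   cooccurrences["extraction"] = {"extraction": 1, "keyword": 1}
     * and countMetrics() computes freq("keyword") = 2, deg("keyword") = 3,
     * so wordScore("keyword") = 1.5, while freq("extraction") = 1,
     * deg("extraction") = 2, so wordScore("extraction") = 2.0. The candidate
     * "keyword extraction" therefore scores 1.5 + 2.0 = 3.5, while "keyword"
     * alone scores 1.5, so the longer phrase gets the higher score.
     */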
    /**
     * Counts deg/freq for every word and for every keyword candidate.
     */
    private void countMetrics() {
        Map<String, Double> wordScore = new HashMap<String, Double>();
        for (String word : cooccurrences.keySet()) {
            // deg and freq
            int degValue = 0;
            for (String coword : cooccurrences.get(word).keySet()) {
                degValue += cooccurrences.get(word).get(coword);
            }
            int freqValue = cooccurrences.get(word).get(word);
            // wordScore = deg/freq
            wordScore.put(word, 1.0 * degValue / freqValue);
        }
        for (KeywordCandidate cand : keywordCandidates) {
            double score = 0;
            for (String word : cand.getWords()) {
                score += wordScore.get(word);
            }
            cand.setScore(score);
        }
        Collections.sort(keywordCandidates, new KeywordCandidate.ScoreComparator());
    }

    /**
     * Returns the n best keywords from the keyword candidates.
     *
     * @param n
     * @return
     */
    private List<String> choiceKeywords(int n) {
        int resultSize = Math.min(n, keywordCandidates.size());
        List<String> result = new ArrayList<String>();
        for (int i = 0; i < resultSize; i++) {
            result.add(keywordCandidates.get(i).getKeyword());
        }
        return result;
    }

    /**
     * Returns extracted keywords.
     *
     * @return
     */
    public List<String> getKeywords() {
        return choiceKeywords(DEFAULT_KEYWORDS_NUMBER);
    }

    /**
     * Returns the n best extracted keywords.
     *
     * @param n
     * @return
     */
    public List<String> getKeywords(int n) {
        return choiceKeywords(n);
    }

    private void setLang(String langCode) {
        if ("fr".equals(langCode)) {
            this.lang = Lang.FR;
        } else if ("pl".equals(langCode)) {
            this.lang = Lang.PL;
        } else {
            this.lang = Lang.EN;
        }
    }

    private void setOption(String option) {
        this.extractionOption = ExtractionOption.valueOf(option);
    }

    /**
     * Checks if s is a number.
     *
     * @param s
     * @return
     */
    private static boolean isNum(String s) {
        try {
            Double.parseDouble(s);
        } catch (NumberFormatException nfe) {
            return false;
        }
        return true;
    }

    public static List<String> getSupportedLanguages() {
        List<String> result = new ArrayList<String>();
        for (Lang l : Lang.values()) {
            result.add(l.langCode);
        }
        return result;
    }

    public static List<String> getAvailableExtractionOptions() {
        List<String> result = new ArrayList<String>();
        for (ExtractionOption opt : ExtractionOption.values()) {
            result.add(opt.name());
        }
        return result;
    }
}
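A minimal usage sketch follows; it is not part of the original file. The class name RakeUsageExample and the sample text are illustrative, and it assumes the CoAnSys keyword-extraction module (with its stopword resources and language-identification dependency) is on the classpath. It builds an extractor from a plain String using the two-argument constructor shown above and prints the top-ranked keywords.

import java.io.IOException;
import java.util.List;

import pl.edu.icm.coansys.kwdextraction.RakeExtractor;

public class RakeUsageExample {

    public static void main(String[] args) throws IOException {
        // Any plain-text document can be used here; "en" selects the English
        // stopword list (see RakeExtractor.getSupportedLanguages()).
        String text = "Rapid Automatic Keyword Extraction selects keyword candidates "
                + "from a single document and ranks them by the deg/freq word scores.";

        RakeExtractor extractor = new RakeExtractor(text, "en");

        // getKeywords() returns at most 8 keywords by default;
        // getKeywords(n) limits the result to the n best candidates.
        List<String> keywords = extractor.getKeywords(5);
        for (String keyword : keywords) {
            System.out.println(keyword);
        }
    }
}

Note that filterTextByLang() discards the whole text when the language identifier does not classify it as the requested language, so getKeywords() returns an empty list for mismatched input.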