org.apdplat.superword.tools.SentenceScorer.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.superword.tools.SentenceScorer.java

Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, Yang Shangchuan,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Scores the lines ("sentences") of a set of text files by the rarity of their words.
 *
 * <p>The score of a sentence is the sum of {@code 1 / frequency} over the words produced by
 * {@code TextAnalyzer.seg}, rounded to two decimal places, so sentences built from
 * low-frequency words score higher. Results are grouped by score, then by book (the source
 * file name with {@code ".txt"} removed).
 *
 * <p>This is a static utility class and cannot be instantiated.
 *
 * @author Yang Shangchuan
 */
public class SentenceScorer {

    private static final Logger LOGGER = LoggerFactory.getLogger(SentenceScorer.class);

    private SentenceScorer() {
    }

    /**
     * Scores every sentence found under {@code path} without any limit on the number of
     * sentences collected.
     *
     * @param path location handed to {@code TextAnalyzer.getFileNames} to discover input files
     * @return scores in ascending order, each mapped to book name and then to its sentences
     * @see #score(String, int)
     */
    public static TreeMap<Float, Map<String, List<String>>> score(String path) {
        return score(path, Integer.MAX_VALUE);
    }

    /**
     * Scores the non-blank, non-duplicate lines of every file found under {@code path}.
     *
     * <p>Only sentences with a score greater than zero are kept. As soon as {@code limit}
     * sentences have been collected and another positive-score sentence is encountered, the
     * method returns what it has gathered so far. Files that cannot be read are logged and
     * skipped.
     *
     * @param path  location handed to {@code TextAnalyzer.getFileNames} to discover input files
     * @param limit maximum number of sentences to collect
     * @return scores in ascending order, each mapped to book name and then to its sentences
     */
    public static TreeMap<Float, Map<String, List<String>>> score(String path, int limit) {
        // Input files to process.
        Set<String> fileNames = TextAnalyzer.getFileNames(path);
        // Word-frequency table built by TextAnalyzer over the same files.
        Map<String, AtomicInteger> frequency = TextAnalyzer.frequency(fileNames);
        // score -> book -> sentences
        TreeMap<Float, Map<String, List<String>>> sentences = new TreeMap<>();
        // Duplicate detection keeps the line text itself: distinct strings can share a
        // hashCode (e.g. "Aa" and "BB"), so comparing hashes alone would drop unique lines.
        Set<String> seen = new HashSet<>();
        Set<String> repeat = new HashSet<>();
        // Number of sentences collected so far.
        int count = 0;
        for (String fileName : fileNames) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new BufferedInputStream(new FileInputStream(fileName))))) {
                String book = Paths.get(fileName).toFile().getName().replace(".txt", "");
                String line;
                while ((line = reader.readLine()) != null) {
                    if (StringUtils.isBlank(line)) {
                        continue;
                    }
                    if (!seen.add(line)) {
                        // Already processed an identical line; remember it for the report.
                        repeat.add(line);
                        continue;
                    }
                    float score = score(line, frequency);
                    if (score > 0) {
                        if (count >= limit) {
                            LOGGER.debug("?????" + limit + "?");
                            return sentences;
                        }
                        count++;
                        sentences.computeIfAbsent(score, key -> new HashMap<>())
                                .computeIfAbsent(book, key -> new ArrayList<>())
                                .add(line);
                    }
                }
            } catch (IOException ex) {
                // Log through the class logger (as the rest of this class does) and move on
                // to the next file.
                LOGGER.error(ex.getMessage(), ex);
            }
        }
        LOGGER.debug("????" + repeat.size());
        int index = 0;
        for (String repeated : repeat) {
            index++;
            LOGGER.debug("\t" + index + "?" + repeated);
        }
        LOGGER.debug("???" + count);
        return sentences;
    }

    /**
     * Writes the scored sentences to a text file as a three-level outline: one
     * {@code score_(i/n)} line per score, one tab-indented {@code book_(j/m)} line per book
     * and one double-tab-indented {@code sentence_(k/p)} line per sentence, followed by a
     * final line carrying the total sentence count.
     *
     * <p>I/O failures are logged, not thrown. The file is written with the platform default
     * charset.
     *
     * @param scores   result of {@link #score(String)} or {@link #score(String, int)}
     * @param fileName path of the file to create or overwrite
     */
    public static void toTextFile(TreeMap<Float, Map<String, List<String>>> scores, String fileName) {
        LOGGER.debug("" + fileName);
        int sentenceCount = 0;
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(fileName))))) {
            int scoreIndex = 0;
            for (Map.Entry<Float, Map<String, List<String>>> score : scores.entrySet()) {
                scoreIndex++;
                writeLine(writer,
                        "score_(" + scoreIndex + "/" + scores.size() + ")" + "" + score.getKey());
                Map<String, List<String>> books = score.getValue();
                int bookIndex = 0;
                for (Map.Entry<String, List<String>> book : books.entrySet()) {
                    bookIndex++;
                    writeLine(writer,
                            "\tbook_(" + bookIndex + "/" + books.size() + ")" + "" + book.getKey());
                    List<String> bookSentences = book.getValue();
                    int sentenceIndex = 0;
                    for (String sentence : bookSentences) {
                        sentenceIndex++;
                        writeLine(writer, "\t\tsentence_(" + sentenceIndex + "/" + bookSentences.size()
                                + ")" + "" + sentence);
                        sentenceCount++;
                    }
                }
            }
            writeLine(writer, "??" + sentenceCount);
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
        LOGGER.debug("" + scores.size());
        LOGGER.debug("??" + sentenceCount);
        LOGGER.debug("?");
    }

    /**
     * Writes {@code text} followed by a newline, logging (not propagating) any I/O failure so
     * that the remaining lines are still attempted.
     */
    private static void writeLine(BufferedWriter writer, String text) {
        try {
            writer.write(text + "\n");
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /**
     * Computes the score of a single sentence.
     *
     * <p>The sentence is segmented with {@code TextAnalyzer.seg}; every word that has a
     * non-zero entry in {@code frequency} contributes {@code 1 / frequency}. Words without a
     * usable entry are logged at error level and ignored. The sum is rounded to two decimal
     * places.
     *
     * @param sentence  text to score
     * @param frequency word to occurrence count
     * @return the rounded score; {@code 0} when no word contributed
     */
    public static float score(String sentence, Map<String, AtomicInteger> frequency) {
        // Debug output is guarded because this method runs once per input line.
        boolean debug = LOGGER.isDebugEnabled();
        if (debug) {
            LOGGER.debug("??" + sentence);
        }
        float score = 0;
        List<String> words = TextAnalyzer.seg(sentence);
        if (debug) {
            LOGGER.debug("?" + words);
        }
        for (String word : words) {
            AtomicInteger fre = frequency.get(word);
            if (fre == null || fre.get() == 0) {
                LOGGER.error("?" + word + "??");
                continue;
            }
            int f = fre.get();
            // Rarer words (smaller f) contribute more.
            float s = 1 / (float) f;
            if (debug) {
                LOGGER.debug("?" + word + "?" + f + "?" + s);
            }
            score += s;
        }
        // NOTE(review): empties the list returned by TextAnalyzer.seg, presumably to release
        // it early; kept as-is - confirm seg always returns a fresh, mutable list.
        words.clear();
        // Round to two decimal places.
        score = Math.round(score * 100) / (float) 100;
        if (debug) {
            LOGGER.debug("" + score);
        }
        return score;
    }

    /**
     * Scores the texts under {@code src/main/resources/it} and writes the ranking to
     * {@code target/sentence_score_rank.txt}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        TreeMap<Float, Map<String, List<String>>> scores = score("src/main/resources/it");
        toTextFile(scores, "target/sentence_score_rank.txt");
    }
}