org.apdplat.superword.tools.TextAnalyzer.java Source code

Introduction

Here is the source code for org.apdplat.superword.tools.TextAnalyzer.java
Source

/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013,
 * Yang Shangchuan, yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Static text-analysis helpers: splits lines of text into lower-cased words,
 * counts word frequencies over one or more text files, scores sentences by
 * the rarity of their words and writes the results out.
 *
 * <p>The class only exposes static methods and cannot be instantiated.
 *
 * @author Yang Shangchuan
 */
public class TextAnalyzer {
    private TextAnalyzer() {
    }

    /** Matches any run of digits; used to detect words such as "P2P" or "Neo4j". */
    private static final Pattern PATTERN = Pattern.compile("\\d+");
    /** Matches a 'u' followed by four hex digits anywhere in a word (e.g. the tail of a unicode escape). */
    private static final Pattern UNICODE = Pattern.compile("[uU][0-9a-fA-F]{4}");
    /** Word separator: every character that is not an ASCII letter or digit. Compiled once instead of per line. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");
    private static final Logger LOGGER = LoggerFactory.getLogger(TextAnalyzer.class);

    /**
     * Counts word frequencies over several text files and merges the
     * per-file counts into a single map.
     *
     * @param files paths of the text files to analyze
     * @return map from word to its total number of occurrences in all files
     */
    public static Map<String, AtomicInteger> frequency(Collection<String> files) {
        Map<String, AtomicInteger> map = new ConcurrentHashMap<>();
        files.forEach(file -> {
            LOGGER.info("parse text file: " + file);
            // Count this file on its own, then fold its counters into the total.
            frequency(file).forEach((word, count) ->
                    map.computeIfAbsent(word, k -> new AtomicInteger()).addAndGet(count.get()));
        });
        LOGGER.info("total unique words count: " + map.size());
        return map;
    }

    /**
     * Counts word frequencies in a single text file.
     *
     * @param file path of the text file
     * @return map from word to its number of occurrences; an empty, mutable
     *         map when the file cannot be opened
     */
    public static Map<String, AtomicInteger> frequency(String file) {
        try (InputStream in = new FileInputStream(file)) {
            return frequency(in);
        } catch (IOException e) {
            LOGGER.error("failed to read text file: " + file, e);
        }
        // Same (mutable) map type as the success path, so callers can treat both alike.
        return new ConcurrentHashMap<>();
    }

    /**
     * Counts word frequencies in the text read from a stream. The stream is
     * decoded as UTF-8, read line by line and closed before returning. Read
     * errors are logged and the counts collected so far are returned.
     *
     * @param inputStream source of the text; closed by this method
     * @return map from word to its number of occurrences
     */
    public static Map<String, AtomicInteger> frequency(InputStream inputStream) {
        Map<String, AtomicInteger> map = new ConcurrentHashMap<>();

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (StringUtils.isBlank(line)) {
                    continue;
                }
                for (String word : seg(line)) {
                    map.computeIfAbsent(word, k -> new AtomicInteger()).incrementAndGet();
                }
            }
        } catch (IOException ex) {
            LOGGER.error("failed to read text from input stream", ex);
        }
        LOGGER.info("unique words count: " + map.size());
        return map;
    }

    /**
     * Splits a sentence into lower-cased words.
     *
     * <p>The sentence is cut at every character that is not an ASCII letter
     * or digit. Tokens shorter than two characters are dropped. A token is
     * lower-cased as a whole (and therefore not split further) when it is
     * shorter than six characters, starts and ends with an upper-case letter,
     * contains a digit, or is all upper-case; any other token is split at
     * each lower-to-upper case transition. Every resulting part is
     * lower-cased, dropped if shorter than two characters, and passed through
     * {@link #irregularity(String)}.
     *
     * @param sentence text to split; must not be {@code null}
     * @return the words in order of appearance, duplicates included
     */
    public static List<String> seg(String sentence) {
        List<String> data = new ArrayList<>();
        // Only build the debug trace when it will actually be logged.
        final boolean debug = LOGGER.isDebugEnabled();
        StringBuilder log = new StringBuilder();
        if (debug) {
            LOGGER.debug("??:" + sentence);
        }
        for (String token : NON_ALPHANUMERIC.split(sentence.trim())) {
            // Tokens are purely alphanumeric, so a length check also covers the empty token.
            if (token.length() < 2) {
                continue;
            }
            String word = token;
            // Lower-casing up front removes every case transition, so such tokens stay in one piece.
            if (word.length() < 6
                    //PostgreSQL
                    || (Character.isUpperCase(word.charAt(word.length() - 1))
                            && Character.isUpperCase(word.charAt(0)))
                    //P2P,Neo4j
                    || PATTERN.matcher(word).find() || StringUtils.isAllUpperCase(word)) {
                // Locale.ROOT keeps ASCII case mapping stable (e.g. under a Turkish default locale).
                word = word.toLowerCase(Locale.ROOT);
            }
            for (String part : splitCamelCase(word)) {
                String w = part.toLowerCase(Locale.ROOT);
                if (w.length() < 2) {
                    continue;
                }
                w = irregularity(w);
                if (w != null) {
                    data.add(w);
                    if (debug) {
                        log.append(w).append(" ");
                    }
                }
            }
        }
        if (debug) {
            LOGGER.debug("?" + log);
        }
        return data;
    }

    /** Splits a word before every upper-case letter that directly follows a lower-case letter. */
    private static List<String> splitCamelCase(String word) {
        List<String> parts = new ArrayList<>();
        int last = 0;
        for (int i = 1; i < word.length(); i++) {
            if (Character.isUpperCase(word.charAt(i)) && Character.isLowerCase(word.charAt(i - 1))) {
                parts.add(word.substring(last, i));
                last = i;
            }
        }
        if (last < word.length()) {
            parts.add(word.substring(last));
        }
        return parts;
    }

    /**
     * Filters out tokens that are not real words and expands a few fragments
     * that result from cutting contractions at the apostrophe.
     *
     * @param word lower-cased, non-empty token
     * @return the word to count (possibly replaced), or {@code null} when the
     *         token should be ignored
     */
    private static String irregularity(String word) {
        if (Character.isDigit(word.charAt(0))) {
            LOGGER.debug("?" + word);
            return null;
        }
        // A token starting with '0' is already rejected by the digit check above,
        // so this branch only documents the intent to drop hex literals.
        if (word.startsWith("0x") || word.startsWith("0X")) {
            LOGGER.debug("?16" + word);
            return null;
        }
        // Long literals such as "100l".
        if (word.endsWith("l") && StringUtils.isNumeric(word.substring(0, word.length() - 1))) {
            LOGGER.debug("?long" + word);
            return null;
        }
        // NOTE(review): find() matches 'u' + 4 hex chars anywhere in the word, so a
        // regular word containing such a run is dropped too - confirm this is intended.
        if (UNICODE.matcher(word).find()) {
            LOGGER.debug("?UNICODE?" + word);
            return null;
        }
        switch (word) {
        //Ill do it. You'll see.
        case "ll":
            return "will";
        //If youre already building applications using Spring.
        case "re":
            return "are";
        //package com.manning.sdmia.ch04;
        case "ch":
            return "chapter";
        //you find youve made a
        case "ve":
            return "have";
        //but it doesnt stop there.
        case "doesn":
            return "does";
        //but it isnt enough.
        case "isn":
            return "is";
        //<input type="text" name="firstName" /><br/>
        case "br":
            return null;
        default:
            return word;
        }
    }

    /**
     * Groups words by their frequency.
     *
     * @param data map from word to its number of occurrences
     * @return map from frequency to a {@link Stat} holding the number of words
     *         with that frequency and a bounded sample of those words
     */
    public static Map<Integer, Stat> distribute(Map<String, AtomicInteger> data) {
        Map<Integer, Stat> stat = new HashMap<>();
        data.forEach((word, count) -> {
            Stat bucket = stat.computeIfAbsent(count.get(), k -> new Stat());
            bucket.increment();
            bucket.addWords(word);
        });
        return stat;
    }

    /**
     * Analyzes a text file, or every {@code .txt} file below a directory, and
     * writes the HTML fragment produced by {@code HtmlFormatter} to
     * {@code target/words_<name>.txt}, where {@code <name>} is the last
     * element of {@code path} without {@code ".txt"}.
     *
     * @param path text file or directory to analyze
     */
    public static void parse(String path) {
        // Collect the files to analyze.
        Set<String> fileNames = getFileNames(path);
        // Count word frequencies over all of them.
        Map<String, AtomicInteger> data = frequency(fileNames);
        // Render the result.
        String htmlFragment = HtmlFormatter.toHtmlFragmentForText(data, fileNames);
        String resultFile = "target/words_" + Paths.get(path).toFile().getName().replace(".txt", "") + ".txt";
        try {
            Files.write(Paths.get(resultFile), htmlFragment.getBytes(StandardCharsets.UTF_8));
            LOGGER.info("" + resultFile);
        } catch (IOException e) {
            LOGGER.error("failed to write result file: " + resultFile, e);
        }
    }

    /**
     * Resolves a path to the set of text files it denotes.
     *
     * <p>A path that is not a directory is returned as the only element,
     * without further checks. A directory is walked recursively; files whose
     * name starts with {@code "."} or does not end with {@code ".txt"} are
     * skipped, all others are added by absolute path.
     *
     * @param path file or directory
     * @return the file names found; possibly incomplete if walking the
     *         directory fails (the error is logged)
     */
    public static Set<String> getFileNames(String path) {
        Set<String> fileNames = new HashSet<>();
        Path root = Paths.get(path);
        if (!Files.isDirectory(root)) {
            LOGGER.info("?" + path);
            fileNames.add(path);
            return fileNames;
        }
        LOGGER.info("?" + path);
        try {
            Files.walkFileTree(root, new SimpleFileVisitor<Path>() {

                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    // Skip hidden files.
                    if (file.toFile().getName().startsWith(".")) {
                        return FileVisitResult.CONTINUE;
                    }
                    String fileName = file.toFile().getAbsolutePath();
                    if (!fileName.endsWith(".txt")) {
                        LOGGER.info("??txt" + fileName);
                        return FileVisitResult.CONTINUE;
                    }
                    fileNames.add(fileName);
                    return FileVisitResult.CONTINUE;
                }

            });
        } catch (IOException e) {
            LOGGER.error("failed to walk directory: " + path, e);
        }
        return fileNames;
    }

    /**
     * Scores every non-blank line of the given text files and keeps the
     * {@code limit} best or worst scoring ones.
     *
     * <p>The score of a line is the sum of {@code 1 / frequency} over its
     * words, where the frequency is counted over all files. Lines without
     * words, and lines whose score equals that of an already kept line, are
     * skipped. Each kept line is suffixed with the name of its source file
     * wrapped in {@code <u><i>...</i></u>}.
     *
     * @param path text file or directory to analyze
     * @param limit maximum number of sentences to return
     * @param isTopN {@code true} to keep the highest scores, {@code false} to
     *        keep the lowest scores
     * @return kept sentences keyed and ordered by ascending score
     */
    public static TreeMap<Float, String> sentence(String path, int limit, boolean isTopN) {
        // Collect the files to analyze.
        Set<String> fileNames = getFileNames(path);
        // Word frequencies over all files.
        Map<String, AtomicInteger> frequency = frequency(fileNames);
        // Kept sentences, ordered by score.
        TreeMap<Float, String> sentences = new TreeMap<>();
        for (String fileName : fileNames) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (StringUtils.isBlank(line)) {
                        continue;
                    }
                    float score = 0;
                    for (String word : seg(line)) {
                        AtomicInteger fre = frequency.get(word);
                        if (fre == null || fre.get() == 0) {
                            // Every word of the line should have been counted above.
                            LOGGER.error("????" + line);
                            score = 0;
                            break;
                        }
                        score += 1 / (float) fre.get();
                    }
                    if (score <= 0) {
                        continue;
                    }
                    // Only one sentence is kept per distinct score.
                    if (sentences.containsKey(score)) {
                        continue;
                    }
                    sentences.put(score, line + " <u><i>"
                            + Paths.get(fileName).toFile().getName().replace(".txt", "") + "</i></u>");
                    // Evict only once the limit is exceeded, so exactly 'limit' entries survive.
                    if (sentences.size() > limit) {
                        if (isTopN) {
                            // Drop the lowest score.
                            sentences.pollFirstEntry();
                        } else {
                            // Drop the highest score.
                            sentences.pollLastEntry();
                        }
                    }
                }
            } catch (IOException ex) {
                LOGGER.error("??", ex);
            }
        }
        return sentences;
    }

    /**
     * Builds a dictionary file from the given texts: one line per word in the
     * form {@code <count>\t<word>}, ordered by descending count. Only words
     * accepted by {@code StringUtils.isAlpha} and shorter than 12 characters
     * are written.
     *
     * @param textPath text file or directory to analyze
     * @param dicPath path of the dictionary file to write
     */
    public static void toDic(String textPath, String dicPath) {
        Map<String, AtomicInteger> data = frequency(getFileNames(textPath));
        List<String> words = data.entrySet().stream()
                .filter(w -> StringUtils.isAlpha(w.getKey()) && w.getKey().length() < 12)
                .sorted((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()))
                .map(e -> e.getValue() + "\t" + e.getKey()).collect(Collectors.toList());
        try {
            Files.write(Paths.get(dicPath), words);
        } catch (IOException e) {
            LOGGER.error("??", e);
        }
    }

    /**
     * Renders the words of the given texts that also appear in the bundled
     * word lists (CET4, CET6, GRE, IELTS, TOEFL, ...) as an HTML table
     * fragment.
     *
     * @param textPath text file or directory to analyze
     * @return the fragment produced by {@code HtmlFormatter.toHtmlTableFragment}
     */
    public static String importantWords(String textPath) {
        Set<Word> wordSet = WordSources.get("/word_CET4.txt", "/word_CET6.txt", "/word_GRE.txt", "/word_IELTS.txt",
                "/word_TOEFL.txt", "/word_ .txt");
        Map<Word, AtomicInteger> data = WordSources.convert(frequency(getFileNames(textPath)));
        Set<Map.Entry<Word, AtomicInteger>> entries = data.entrySet().stream()
                .filter(entry -> wordSet.contains(entry.getKey())).collect(Collectors.toSet());
        return HtmlFormatter.toHtmlTableFragment(entries, 5);
    }

    /**
     * Command-line entry point; prints the important words found below
     * {@code src/main/resources/it}. The commented-out calls show other usages.
     */
    public static void main(String[] args) throws Exception {
        //parse("src/main/resources/it/spring/Spring in Action 4th Edition.txt");
        //parse("src/main/resources/it/spring");
        //parse("src/main/resources/it");
        //toDic("src/main/resources/it", "src/main/resources/word_it.txt");
        System.out.print(importantWords("src/main/resources/it"));
    }

    /**
     * Counter for one frequency bucket: how many words fall into it, plus a
     * bounded sample of those words.
     */
    public static class Stat {
        /** Upper bound on the number of sample words kept. */
        private static final int MAX_WORDS = 11;

        private final AtomicInteger count = new AtomicInteger();
        private final List<String> words = new ArrayList<>();

        /** @return number of times {@link #increment()} has been called */
        public int count() {
            return count.get();
        }

        /** Adds one to the counter. */
        public void increment() {
            count.incrementAndGet();
        }

        /** @return the live list of sample words collected so far */
        public List<String> getWords() {
            return words;
        }

        /**
         * Records a sample word; ignored once the sample is full.
         *
         * @param word word to record
         */
        public void addWords(String word) {
            if (this.words.size() < MAX_WORDS) {
                this.words.add(word);
            }
        }
    }
}