reviews.indexing.HTMLDocument.java Source code

Introduction

Here is the source code for reviews.indexing.HTMLDocument.java, a utility that converts an HTML review file into a Lucene Document, scoring review features with the SWN sentiment lexicon and Stanford CoreNLP.

Source

package reviews.indexing;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.json.JSONException;
import org.json.JSONObject;

import reviews.indexing.tokenizing.ReviewContentCleaner;
import reviews.indexing.tokenizing.SWN.SWN;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/** A utility for making Lucene Documents for HTML documents. */
public class HTMLDocument {

    static char dirSep = System.getProperty("file.separator").charAt(0);

    /**
     * Appends the path and last-modified date into a string in such a way
     * that lexicographic sorting gives the same results as a walk of the file
     * hierarchy. Thus null (\u0000) is used both to separate directory
     * components and to separate the path from the date.
     */
    public static String uid(File f) {
        return f.getPath().replace(dirSep, '\u0000') + "\u0000"
                + DateTools.timeToString(f.lastModified(), DateTools.Resolution.SECOND);
    }

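    /** Converts a uid produced by {@link #uid(File)} back into a URL/path, dropping the trailing date. */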
    public static String uid2url(String uid) {
        String url = uid.replace('\u0000', '/'); // replace nulls with slashes
        return url.substring(0, url.lastIndexOf('/')); // remove date from end
    }

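    /**
     * Runs the CoreNLP pipeline over one review segment (pros, cons or
     * summary) and folds its sentences into featureMap: every sentence is
     * attached to each word it contains from IndexReviews.FEATURE_SET, scored
     * as (sentenceScore + scoreBonus) * scoreModifier.
     *
     * @return the updated featureMap (the same instance that was passed in)
     */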
    private static HashMap<String, ArrayList<FeatureMapData>> processReviewSegment(StanfordCoreNLP pipeline,
            SWN swn, HashMap<String, ArrayList<FeatureMapData>> featureMap, String segment, double scoreBonus,
            double scoreModifier) {

        Annotation document = new Annotation(segment);

        // run all Annotators on this text
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);

        OpinionAnalyzer opinion = new OpinionAnalyzer(swn);

        for (CoreMap sentence : sentences) {

            // features seen in this sentence, so each sentence is added to a feature only once
            Set<String> sfSet = new HashSet<String>();

            // traverse the words in the current sentence, collecting feature words
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                // this is the text of the token
                String word = token.get(TextAnnotation.class);
                String pos = token.get(PartOfSpeechAnnotation.class); // POS tag, kept for debugging only

                if (IndexReviews.FEATURE_SET.contains(word)) {
                    sfSet.add(word);
                }
            }

            // for each identified feature, update featureMap
            for (String word : sfSet) {
                double score = opinion.getSentenceScore(word, sentence);
                score += scoreBonus; // apply score bonus
                score *= scoreModifier; // apply score modifier

                // fetch any sentences already recorded for this feature
                ArrayList<FeatureMapData> featureFeedback = featureMap.get(word);

                if (featureFeedback == null) {
                    featureFeedback = new ArrayList<FeatureMapData>();
                }

                // add a new sentence
                featureFeedback.add(new FeatureMapData(sentence.get(TextAnnotation.class), score,
                        score >= 0));

                featureMap.put(word, featureFeedback);
            }
        }

        return featureMap;
    }

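    /**
     * Makes a Lucene Document for the given HTML review file: indexes its
     * path, modified date and uid, splits the cleaned review into pros, cons
     * and summary, scores the features found in each segment, and stores the
     * resulting feature map as JSON alongside the parsed title and summary.
     */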
    public static Document Document(File f, SWN swn) throws IOException, InterruptedException {
        // make a new, empty document
        Document doc = new Document();

        // Add the url as a field named "path". Use a field that is
        // indexed (i.e. searchable), but don't tokenize the field into words.
        doc.add(new Field("path", f.getPath().replace(dirSep, '/'), Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Add the last modified date of the file as a field named "modified".
        // Use a field that is indexed (i.e. searchable), but don't tokenize
        // the field into words.
        doc.add(new Field("modified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE),
                Field.Store.YES, Field.Index.NOT_ANALYZED));

        // Add the uid as a field, so that the index can be incrementally
        // maintained. This field is not stored with the document; it is
        // indexed, but not tokenized prior to indexing.
        doc.add(new Field("uid", uid(f), Field.Store.NO, Field.Index.NOT_ANALYZED));

        FileInputStream fis = new FileInputStream(f);
        HTMLParser parser = new HTMLParser(fis);

        Reader reader = parser.getReader();

        // read the parsed text content into a string
        StringBuilder contentBuilder = new StringBuilder();
        int c;
        while ((c = reader.read()) != -1) {
            contentBuilder.append(Character.toChars(c));
        }
        reader.close();
        String contents = contentBuilder.toString();

        // clean the review content and split it into pros, cons and summary
        ReviewContentCleaner rcc = new ReviewContentCleaner(contents);

        HashMap<String, ArrayList<FeatureMapData>> featureMap = new HashMap<String, ArrayList<FeatureMapData>>();

        // creates a StanfordCoreNLP object, with POS tagging, parsing
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos");
        props.put("pos.model", "left3words-wsj-0-18.tagger");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Here you can adjust the score bonus and the score modifier for review pros and cons.
        // By default, the score bonus is +2.0 for pros, -2.0 for cons and 0.0 for the summary;
        // the score modifier is 1.0 for all three.

        // process review pros
        featureMap = processReviewSegment(pipeline, swn, featureMap, rcc.getPros(), 2.0, 1.0);
        // process review cons
        featureMap = processReviewSegment(pipeline, swn, featureMap, rcc.getCons(), -2.0, 1.0);
        // process review summary
        featureMap = processReviewSegment(pipeline, swn, featureMap, rcc.getSummary(), 0.0, 1.0);

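        // build a space-separated list of all identified features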
        String features = "";
        for (String s : featureMap.keySet()) {
            features += s + " ";
        }

        System.out.println(features);
        doc.add(new Field("features", features, Field.Store.YES, Field.Index.ANALYZED));

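        // serialize the feature map (feature -> scored sentences) to JSON so it
        // can be stored, unindexed, in the "feature-contents" field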
        JSONObject serializedFeatureMap = new JSONObject();

        try {
            serializedFeatureMap.put("featureMap", featureMap);

            doc.add(new Field("feature-contents", serializedFeatureMap.toString(), Field.Store.YES,
                    Field.Index.NO));

        } catch (JSONException e) {
            e.printStackTrace();
        }

        // The demo's "contents" field is omitted here: the parser's reader has
        // already been consumed above while extracting the review text.
        //   doc.add(new Field("contents", parser.getReader()));

        // Add the summary as a field that is stored and returned with
        // hit documents for display
        doc.add(new Field("summary", parser.getSummary(), Field.Store.YES, Field.Index.NO));

        // Add the title as a field so that it can be searched and is stored.
        doc.add(new Field("title", parser.getTitle(), Field.Store.YES, Field.Index.ANALYZED));

        // return the document
        return doc;
    }

    private HTMLDocument() {
    }
}
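
Usage

For context, here is a minimal sketch of how this factory might be driven. The no-argument SWN constructor and the Lucene 3.x IndexWriter setup are assumptions (the Field.Store/Field.Index API above suggests a Lucene 2.9/3.x era codebase), and the project's actual entry point appears to be IndexReviews:

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import reviews.indexing.HTMLDocument;
import reviews.indexing.tokenizing.SWN.SWN;

public class IndexOneReview {
    public static void main(String[] args) throws Exception {
        // assumption: SWN can be constructed with no arguments; check the
        // project's IndexReviews driver for the real setup
        SWN swn = new SWN();

        IndexWriter writer = new IndexWriter(FSDirectory.open(new File("index")),
                new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        // build a Lucene Document from one HTML review file and index it
        Document doc = HTMLDocument.Document(new File(args[0]), swn);
        writer.addDocument(doc);

        writer.close();
    }
}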