edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator.java Source code

Java tutorial

Introduction

Here is the source code for edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator.java, a wrapper of the Word2Vec-based dataless classifier for the CogComp pipeline.

Source

/**
 * This software is released under the University of Illinois/Research and Academic Use License. See
 * the LICENSE file in the root folder for details. Copyright (c) 2016
 *
 * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
 * http://cogcomp.cs.illinois.edu/
 */
package edu.illinois.cs.cogcomp.datalessclassification.ta;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.lang.NotImplementedException;
import org.json.simple.JSONObject;

import edu.illinois.cs.cogcomp.annotation.AnnotatorException;
import edu.illinois.cs.cogcomp.datalessclassification.config.DatalessConfigurator;
import edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
import edu.illinois.cs.cogcomp.datalessclassification.representation.w2v.MemoryBasedW2V;
import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A wrapper of the Word2Vec-based Dataless Classifier for the cogcomp pipeline.
 *
 * <p>The annotator registers itself under the view name {@link ViewNames#DATALESS_W2V}. It can be
 * built either from a configuration alone, or from a configuration plus an explicit label
 * hierarchy (JSON object, hierarchy files, or in-memory maps). The constructors that receive an
 * explicit hierarchy build the label tree, the embedding and the classifier immediately and mark
 * the instance as initialized.
 *
 * @author shashank
 */

public class W2VDatalessAnnotator extends ADatalessAnnotator {
    private static final String NAME = W2VDatalessAnnotator.class.getCanonicalName();
    private static final Logger logger = LoggerFactory.getLogger(W2VDatalessAnnotator.class);

    /**
     * Creates an annotator using the default configuration provided by
     * {@link W2VDatalessConfigurator#getDefaultConfig()}.
     */
    public W2VDatalessAnnotator() {
        this(new W2VDatalessConfigurator().getDefaultConfig());
    }

    /**
     * Creates an annotator from the given configuration.
     *
     * <p>No initialization is performed in this constructor itself; the configuration and the
     * {@code true} flag are handed to the superclass (NOTE(review): the flag presumably requests
     * lazy initialization -- confirm against {@code ADatalessAnnotator}).
     *
     * @param config the configuration forwarded to the superclass
     */
    public W2VDatalessAnnotator(ResourceManager config) {
        super(ViewNames.DATALESS_W2V, config, true);
    }

    /**
     * Creates and eagerly initializes an annotator whose label hierarchy is given as a JSON object.
     *
     * @param config configuration used to set up the embedding and the classifier
     * @param jsonHierarchy the label hierarchy passed to {@code initializeLabelTree}
     * @throws NotImplementedException declared by the original signature; not thrown directly by
     *         this constructor's own code
     */
    public W2VDatalessAnnotator(ResourceManager config, JSONObject jsonHierarchy) throws NotImplementedException {
        super(ViewNames.DATALESS_W2V, true);
        logger.info("Initializing LabelTree...");
        initializeLabelTree(jsonHierarchy);
        logger.info("LabelTree Initialization Done.");

        finishEagerInitialization(config);
    }

    /**
     * Creates and eagerly initializes an annotator whose label hierarchy is read from files.
     *
     * @param config configuration used to set up the embedding and the classifier
     * @param hierarchyPath path of the hierarchy file passed to {@code initializeLabelTree}
     * @param labelNameFile path of the label-name file passed to {@code initializeLabelTree}
     * @param labelDescFile path of the label-description file passed to {@code initializeLabelTree}
     */
    public W2VDatalessAnnotator(ResourceManager config, String hierarchyPath, String labelNameFile,
            String labelDescFile) {
        super(ViewNames.DATALESS_W2V, true);
        logger.info("Initializing LabelTree...");
        initializeLabelTree(hierarchyPath, labelNameFile, labelDescFile);
        logger.info("LabelTree Initialization Done.");

        finishEagerInitialization(config);
    }

    /**
     * Creates and eagerly initializes an annotator whose label hierarchy is given as in-memory
     * collections.
     *
     * @param config configuration used to set up the embedding and the classifier
     * @param topNodes top-level nodes passed to {@code initializeLabelTree}
     * @param childMap parent-to-children mapping passed to {@code initializeLabelTree}
     * @param labelNameMap label name mapping passed to {@code initializeLabelTree}
     * @param labelDescMap label description mapping passed to {@code initializeLabelTree}
     */
    public W2VDatalessAnnotator(ResourceManager config, Set<String> topNodes, Map<String, Set<String>> childMap,
            Map<String, String> labelNameMap, Map<String, String> labelDescMap) {
        super(ViewNames.DATALESS_W2V, true);
        logger.info("Initializing LabelTree...");
        initializeLabelTree(topNodes, childMap, labelNameMap, labelDescMap);
        logger.info("LabelTree Initialization Done.");

        finishEagerInitialization(config);
    }

    /**
     * Shared tail of the eagerly-initializing constructors: sets up the embedding, then the
     * classifier, and finally flags the annotator as initialized.
     *
     * @param config configuration handed to both initialization steps
     */
    private void finishEagerInitialization(ResourceManager config) {
        logger.info("Initializing Embedding...");
        initializeEmbedding(config);
        logger.info("Embedding Initialization Done.");

        logger.info("Initializing Classifier...");
        initializeClassifier(config);
        logger.info("Classifier Initialization Done.");

        isInitialized = true;
    }

    /**
     * @return the canonical name of this class
     */
    @Override
    protected String getClassName() {
        return NAME;
    }

    /**
     * Initializes the Word2Vec Embedding that will be used for computing the representations.
     *
     * <p>Resets {@code conceptWeights} to an empty map, reads the embedding dimension from the
     * {@link W2VDatalessConfigurator#W2V_DIM} property and builds a {@link MemoryBasedW2V} from
     * the configuration.
     *
     * @param config configuration supplying the embedding dimension and embedding settings
     */
    protected void initializeEmbedding(ResourceManager config) {
        conceptWeights = new HashMap<>();
        embedding_dim = config.getInt(W2VDatalessConfigurator.W2V_DIM);
        embedding = new MemoryBasedW2V(config);
    }

    /**
     * Command-line demo: annotates a single test document and prints the predicted label IDs and
     * label names.
     *
     * <p>If the config file cannot be read, a warning is logged and the default configuration is
     * used. On a missing test file, an annotation failure or any other I/O failure, the error is
     * logged with its cause and the JVM exits with status {@code -1}.
     *
     * @param args command-line options: {@code config} (config file path, default
     *        {@code config/project.properties}) and {@code testFile} (document to annotate,
     *        default {@code data/graphicsTestDocument.txt})
     */
    public static void main(String[] args) {
        CommandLine cmd = ESADatalessAnnotator.getCMDOpts(args);

        String configFile = cmd.getOptionValue("config", "config/project.properties");
        ResourceManager rm;

        try {
            ResourceManager nonDefaultRm = new ResourceManager(configFile);

            rm = new W2VDatalessConfigurator().getConfig(nonDefaultRm);
        } catch (IOException e) {
            // Falling back to defaults is deliberate, but must not happen silently.
            logger.warn("Could not read config file " + configFile + " ... using the default configuration", e);
            rm = new W2VDatalessConfigurator().getDefaultConfig();
        }

        String testFile = cmd.getOptionValue("testFile", "data/graphicsTestDocument.txt");

        try {
            // The file is fully read and closed before the (potentially slow) annotator is built.
            String text = readAsSingleLine(testFile);

            TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            TextAnnotation ta = taBuilder.createTextAnnotation(text);

            W2VDatalessAnnotator datalessAnnotator = new W2VDatalessAnnotator(rm);
            datalessAnnotator.addView(ta);

            List<Constituent> annots = ta.getView(ViewNames.DATALESS_W2V).getConstituents();

            System.out.println("Predicted LabelIDs:");

            for (Constituent annot : annots) {
                System.out.println(annot.getLabel());
            }

            Map<String, String> labelNameMap = DatalessAnnotatorUtils
                    .getLabelNameMap(rm.getString(DatalessConfigurator.LabelName_Path.key));

            System.out.println("Predicted Labels:");

            for (Constituent annot : annots) {
                System.out.println(labelNameMap.get(annot.getLabel()));
            }
        } catch (FileNotFoundException e) {
            logger.error("Test File not found at " + testFile + " ... exiting", e);
            System.exit(-1);
        } catch (AnnotatorException e) {
            logger.error("Error Annotating the Test Document with the Dataless View ... exiting", e);
            System.exit(-1);
        } catch (IOException e) {
            logger.error("IO Error while reading the test file ... exiting", e);
            System.exit(-1);
        }
    }

    /**
     * Reads a text file as UTF-8 and joins its lines with single spaces.
     *
     * @param path path of the file to read
     * @return the file content with line breaks replaced by spaces and surrounding whitespace
     *         trimmed
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading or closing the file fails
     */
    private static String readAsSingleLine(String path) throws IOException {
        StringBuilder sb = new StringBuilder();

        // Explicit charset: FileReader would use the platform default on JVMs before Java 18.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;

            while ((line = br.readLine()) != null) {
                sb.append(line);
                sb.append(" ");
            }
        }

        return sb.toString().trim();
    }
}