de.tudarmstadt.ukp.dkpro.tc.core.task.MetaInfoTask.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.tc.core.task.MetaInfoTask.java

Source

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.tc.core.task;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.io.bincas.BinaryCasReader;
import de.tudarmstadt.ukp.dkpro.lab.engine.TaskContext;
import de.tudarmstadt.ukp.dkpro.lab.storage.StorageService.AccessMode;
import de.tudarmstadt.ukp.dkpro.lab.task.Discriminator;
import de.tudarmstadt.ukp.dkpro.lab.uima.task.impl.UimaTaskBase;
import de.tudarmstadt.ukp.dkpro.tc.api.exception.TextClassificationException;
import de.tudarmstadt.ukp.dkpro.tc.api.features.meta.MetaCollector;
import de.tudarmstadt.ukp.dkpro.tc.core.Constants;
import de.tudarmstadt.ukp.dkpro.tc.core.feature.SequenceContextMetaCollector;
import de.tudarmstadt.ukp.dkpro.tc.core.feature.UnitContextMetaCollector;
import de.tudarmstadt.ukp.dkpro.tc.core.util.TaskUtils;

/**
 * Iterates over all documents and stores required collection-level meta data, e.g. which n-grams
 * appear in the documents.
 * 
 */
public class MetaInfoTask extends UimaTaskBase {

    /**
     * Public name of the task key
     */
    public static final String META_KEY = "meta";
    /**
     * Public name of the folder where meta information will be stored within the task
     */
    public static final String INPUT_KEY = "input";

    private List<String> operativeViews;

    @Discriminator
    protected List<String> featureSet;

    @Discriminator
    private String featureMode;

    @Discriminator
    protected List<Object> pipelineParameters;

    private Set<Class<? extends MetaCollector>> metaCollectorClasses;

    @Discriminator
    private File filesRoot;

    @Discriminator
    private Collection<String> files_training;

    @Override
    public CollectionReaderDescription getCollectionReaderDescription(TaskContext aContext)
            throws ResourceInitializationException, IOException {
        // TrainTest setup: input files are set as imports
        if (filesRoot == null || files_training == null) {
            File root = aContext.getStorageLocation(INPUT_KEY, AccessMode.READONLY);
            Collection<File> files = FileUtils.listFiles(root, new String[] { "bin" }, true);
            return createReaderDescription(BinaryCasReader.class, BinaryCasReader.PARAM_PATTERNS, files);
        }
        // CV setup: filesRoot and files_atrining have to be set as dimension
        else {
            return createReaderDescription(BinaryCasReader.class, BinaryCasReader.PARAM_PATTERNS, files_training);
        }
    }

    @Override
    public AnalysisEngineDescription getAnalysisEngineDescription(TaskContext aContext)
            throws ResourceInitializationException, IOException {

        // check for error conditions
        if (featureSet == null) {
            throw new ResourceInitializationException(
                    new TextClassificationException("No feature extractors have been added to the experiment."));
        }

        // automatically determine the required metaCollector classes from the provided feature
        // extractors
        try {
            metaCollectorClasses = TaskUtils.getMetaCollectorsFromFeatureExtractors(featureSet);
        } catch (ClassNotFoundException e) {
            throw new ResourceInitializationException(e);
        } catch (InstantiationException e) {
            throw new ResourceInitializationException(e);
        } catch (IllegalAccessException e) {
            throw new ResourceInitializationException(e);
        }

        if (featureMode.equals(Constants.FM_UNIT)) {
            // add additional unit context meta collector that extracts the context around text classification units
            // mainly used for error analysis purposes
            metaCollectorClasses.add(UnitContextMetaCollector.class);
        }

        if (featureMode.equals(Constants.FM_SEQUENCE)) {
            metaCollectorClasses.add(SequenceContextMetaCollector.class);
        }

        // collect parameter/key pairs that need to be set
        Map<String, String> parameterKeyPairs = new HashMap<String, String>();
        for (Class<? extends MetaCollector> metaCollectorClass : metaCollectorClasses) {
            try {
                parameterKeyPairs.putAll(metaCollectorClass.newInstance().getParameterKeyPairs());
            } catch (InstantiationException e) {
                throw new ResourceInitializationException(e);
            } catch (IllegalAccessException e) {
                throw new ResourceInitializationException(e);
            }
        }

        List<Object> parameters = new ArrayList<Object>();
        if (pipelineParameters != null) {
            parameters.addAll(pipelineParameters);
        }

        // make sure that the meta key import can be resolved (even when no meta features have been
        // extracted, as in the regression demo)
        // TODO better way to do this?
        if (parameterKeyPairs.size() == 0) {
            File file = new File(aContext.getStorageLocation(META_KEY, AccessMode.READONLY).getPath());
            file.mkdir();
        }

        for (Entry<String, String> entry : parameterKeyPairs.entrySet()) {
            File file = new File(aContext.getStorageLocation(META_KEY, AccessMode.READONLY), entry.getValue());
            parameters.addAll(Arrays.asList(entry.getKey(), file.getAbsolutePath()));
        }

        AggregateBuilder builder = new AggregateBuilder();

        for (Class<? extends MetaCollector> metaCollectorClass : metaCollectorClasses) {
            if (operativeViews != null) {
                for (String viewName : operativeViews) {
                    builder.add(createEngineDescription(metaCollectorClass, parameters.toArray()),
                            CAS.NAME_DEFAULT_SOFA, viewName);
                }
            } else {
                builder.add(createEngineDescription(metaCollectorClass, parameters.toArray()));
            }
        }
        return builder.createAggregateDescription();
    }

    /**
     * @param operativeViews
     */
    public void setOperativeViews(List<String> operativeViews) {
        this.operativeViews = operativeViews;
    }
}