org.apache.ctakes.ytex.kernel.BaseClassifierEvaluationParser.java Source code

Java tutorial

Introduction

Here is the source code for BaseClassifierEvaluationParser.java, from the package org.apache.ctakes.ytex.kernel.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.ytex.kernel;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
import org.apache.ctakes.ytex.kernel.model.ClassifierEvaluation;
import org.apache.ctakes.ytex.kernel.model.ClassifierInstanceEvaluation;

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

/**
 * Base class with miscellaneous helper methods shared by the parsers of the
 * various classifier output types: token extraction, option parsing, loading
 * of instance id / class info files, and population of
 * {@link ClassifierEvaluation} objects.
 * 
 * @author vhacongarlav
 * 
 */
public abstract class BaseClassifierEvaluationParser implements ClassifierEvaluationParser {
    private static final Log log = LogFactory.getLog(BaseClassifierEvaluationParser.class);

    /** token delimiter: a single whitespace character or the end of input */
    public static Pattern wsPattern = Pattern.compile("\\s|\\z");
    /** token delimiter: a single whitespace character, a dot, or the end of input */
    public static Pattern wsDotPattern = Pattern.compile("\\s|\\.|\\z");

    private ClassifierEvaluationDao classifierEvaluationDao;

    /**
     * One row of a class info file: the instance id, whether the instance
     * belongs to the training set, and the name of its target class.
     */
    public static class InstanceClassInfo {
        long instanceId;
        boolean train;
        String targetClassName;

        public InstanceClassInfo() {
            super();
        }

        public InstanceClassInfo(long instanceId, boolean train, String targetClassName) {
            super();
            this.instanceId = instanceId;
            this.train = train;
            this.targetClassName = targetClassName;
        }

        public long getInstanceId() {
            return instanceId;
        }

        public void setInstanceId(long instanceId) {
            this.instanceId = instanceId;
        }

        public boolean isTrain() {
            return train;
        }

        public void setTrain(boolean train) {
            this.train = train;
        }

        public String getTargetClassName() {
            return targetClassName;
        }

        public void setTargetClassName(String targetClassName) {
            this.targetClassName = targetClassName;
        }
    }

    public ClassifierEvaluationDao getClassifierEvaluationDao() {
        return classifierEvaluationDao;
    }

    public void setClassifierEvaluationDao(ClassifierEvaluationDao classifierEvaluationDao) {
        this.classifierEvaluationDao = classifierEvaluationDao;
    }

    /**
     * extract the text that precedes the first match of the delimiter pattern.
     * 
     * @param line
     *            text to extract the token from
     * @param tokDelimPattern
     *            pattern that marks the end of the token, e.g.
     *            {@link #wsPattern}
     * @return the token, or null if the delimiter does not occur in the line or
     *         occurs at the very beginning (i.e. the token would be empty)
     */
    public static String extractFirstToken(String line, Pattern tokDelimPattern) {
        Matcher wsMatcher = tokDelimPattern.matcher(line);
        String token = null;
        if (wsMatcher.find() && wsMatcher.start() > 0) {
            token = line.substring(0, wsMatcher.start());
        }
        return token;
    }

    /**
     * read instance ids from a file containing one id per line. Blank lines
     * are skipped and surrounding whitespace is ignored, consistent with the
     * class info loaders of this class.
     * 
     * @param instanceIdFile
     *            name of the file to read
     * @return the instance ids in file order; null if the file does not exist
     *         (a warning is logged in that case)
     * @throws IOException
     *             if reading or closing the file fails
     * @throws NumberFormatException
     *             if a non-blank line is not a valid long
     */
    public List<Long> parseInstanceIds(String instanceIdFile) throws IOException {
        BufferedReader instanceIdReader = null;
        List<Long> instanceIds = new ArrayList<Long>();
        try {
            instanceIdReader = new BufferedReader(new FileReader(instanceIdFile));
            String line = null;
            while ((line = instanceIdReader.readLine()) != null) {
                String instanceId = line.trim();
                // tolerate blank lines, e.g. an empty last line
                if (instanceId.length() > 0) {
                    instanceIds.add(Long.parseLong(instanceId));
                }
            }
            return instanceIds;
        } catch (FileNotFoundException e) {
            log.warn(instanceIdFile + " not available, instance_ids will not be stored");
            return null;
        } finally {
            if (instanceIdReader != null)
                instanceIdReader.close();
        }
    }

    /**
     * parse a number out of the libsvm command line that matches the specified
     * pattern.
     * 
     * @param pCost
     *            pattern whose first capturing group holds the number
     * @param options
     *            the command line options to search
     * @return null if option not present or if the captured text cannot be
     *         parsed as a double (a warning is logged in the latter case)
     */
    protected Double parseDoubleOption(Pattern pCost, String options) {
        Matcher m = pCost.matcher(options);
        if (m.find()) {
            String toParse = m.group(1);
            try {
                return Double.parseDouble(toParse);
            } catch (NumberFormatException nfe) {
                log.warn("could not parse: " + toParse, nfe);
            }
        }
        return null;
    }

    /**
     * 
     * parse a number out of the libsvm command line that matches the specified
     * pattern.
     * 
     * @param pKernel
     *            pattern whose first capturing group holds the number
     * @param options
     *            the command line options to search
     * @return null if option not present or if the captured text cannot be
     *         parsed as an int (a warning is logged in the latter case, same
     *         as {@link #parseDoubleOption(Pattern, String)})
     */
    protected Integer parseIntOption(Pattern pKernel, String options) {
        Matcher m = pKernel.matcher(options);
        if (m.find()) {
            String toParse = m.group(1);
            try {
                return Integer.parseInt(toParse);
            } catch (NumberFormatException nfe) {
                log.warn("could not parse: " + toParse, nfe);
            }
        }
        return null;
    }

    /**
     * set fold, run and label of the evaluation to the values that
     * <code>FileUtil</code> parses out of the instance id file name.
     * 
     * @param instanceIdFile
     *            file name to parse
     * @param eval
     *            updated
     */
    protected void initClassifierEvaluation(String instanceIdFile, ClassifierEvaluation eval) {
        eval.setFold(FileUtil.parseFoldFromFileName(instanceIdFile));
        eval.setRun(FileUtil.parseRunFromFileName(instanceIdFile));
        eval.setLabel(FileUtil.parseLabelFromFileName(instanceIdFile));
    }

    /**
     * copy name, experiment, param1, param2 and options from the properties
     * (keys <code>kernel.name</code>, <code>kernel.experiment</code>,
     * <code>kernel.param1</code>, <code>kernel.param2</code> and the key of
     * <code>ParseOption.EVAL_LINE</code>) to the evaluation. param1 is only
     * set if the property is present and not empty.
     * 
     * @param props
     *            properties to read from
     * @param eval
     *            updated
     * @throws NumberFormatException
     *             if <code>kernel.param1</code> is not a valid double
     */
    protected void initClassifierEvaluationFromProperties(Properties props, ClassifierEvaluation eval) {
        eval.setName(props.getProperty("kernel.name"));
        eval.setExperiment(props.getProperty("kernel.experiment"));
        String strParam1 = props.getProperty("kernel.param1");
        if (strParam1 != null && strParam1.length() > 0)
            eval.setParam1(Double.parseDouble(strParam1));
        eval.setParam2(props.getProperty("kernel.param2"));
        eval.setOptions(props.getProperty(ParseOption.EVAL_LINE.getOptionKey()));
    }

    /**
     * load properties from <tt>outputDir/options.properties</tt>. returns empty
     * properties if the file does not exist
     * 
     * @param outputDir
     *            directory that contains <tt>options.properties</tt>
     * @return the properties as loaded by <code>FileUtil.loadProperties</code>
     * @throws FileNotFoundException
     * @throws IOException
     */
    public Properties loadProps(File outputDir) throws FileNotFoundException, IOException {
        return FileUtil.loadProperties(outputDir.getPath() + File.separator + "options.properties", true);
    }

    /**
     * @param file
     *            path of the file to check
     * @return true if the file can be read
     */
    protected boolean checkFileRead(String file) {
        return (new File(file)).canRead();
    }

    /**
     * @param kernelProps
     *            properties to read from
     * @return value of the <code>ParseOption.DATA_BASENAME</code> property, or
     *         that option's default value if the property is not set
     */
    protected String getFileBaseName(Properties kernelProps) {
        return kernelProps.getProperty(ParseOption.DATA_BASENAME.getOptionKey(),
                ParseOption.DATA_BASENAME.getDefaultValue());
    }

    /**
     * save the classifier evaluation via the dao. Instance evaluations are
     * stored if either the <code>STORE_INSTANCE_EVAL</code> or the
     * <code>STORE_UNLABELED</code> option is enabled; IR statistics are stored
     * if the <code>STORE_IRSTATS</code> option is enabled.
     * 
     * @param kernelProps
     *            properties holding the store options
     * @param ce
     *            evaluation to save
     * @param classIdToNameMap
     *            class id to class name mapping, passed through to the dao
     */
    protected void storeSemiSupervised(Properties kernelProps, ClassifierEvaluation ce,
            BiMap<Integer, String> classIdToNameMap) {
        boolean storeInstanceEval = isOptionEnabled(kernelProps, ParseOption.STORE_INSTANCE_EVAL);
        boolean storeUnlabeled = isOptionEnabled(kernelProps, ParseOption.STORE_UNLABELED);
        boolean storeIR = isOptionEnabled(kernelProps, ParseOption.STORE_IRSTATS);
        // save the classifier evaluation
        this.getClassifierEvaluationDao().saveClassifierEvaluation(ce, classIdToNameMap,
                storeInstanceEval || storeUnlabeled, storeIR, 0);
    }

    /**
     * @return true if the option's value in the properties (or, if not set, the
     *         option's default value) equals YES, ignoring case
     */
    private static boolean isOptionEnabled(Properties kernelProps, ParseOption option) {
        return YES.equalsIgnoreCase(kernelProps.getProperty(option.getOptionKey(), option.getDefaultValue()));
    }

    /**
     * used by semil & svmlin to store semisupervised predictions. these train
     * ml and make test predictions in a single step.
     * 
     * @param ce
     *            updated
     * @param listClassInfo
     *            the class info 0 - instance id, 1 - train/test, 2 - target
     *            class id
     * @param storeUnlabeled
     *            should the unlabeled predictions be stored?
     * @param classIds
     *            predicted class ids
     */
    protected void updateSemiSupervisedPredictions(ClassifierEvaluation ce, List<List<Long>> listClassInfo,
            boolean storeUnlabeled, int[] classIds) {
        for (int i = 0; i < classIds.length; i++) {
            List<Long> classInfo = listClassInfo.get(i);
            long instanceId = classInfo.get(0);
            boolean train = classInfo.get(1) == 1;
            int targetClassId = classInfo.get(2).intValue();
            // if we are storing unlabeled instance ids, save this instance
            // evaluation
            // else only store it if this is a test instance id - save it
            if (storeUnlabeled || !train) {
                ClassifierInstanceEvaluation cie = new ClassifierInstanceEvaluation();
                cie.setClassifierEvaluation(ce);
                cie.setInstanceId(instanceId);
                cie.setPredictedClassId(classIds[i]);
                // a target class id of 0 is not stored
                if (targetClassId != 0)
                    cie.setTargetClassId(targetClassId);
                // add the instance eval to the parent
                ce.getClassifierInstanceEvaluations().put(instanceId, cie);
            }
        }
    }

    /**
     * same as
     * {@link #updateSemiSupervisedPredictions(ClassifierEvaluation, List, boolean, int[])}
     * , but for predictions and targets given as class names that are
     * translated to class ids via <code>classNameToIdMap</code>.
     * 
     * @param ce
     *            updated
     * @param listClassInfo
     *            class info of the instances, in the same order as
     *            <code>predictedClassNames</code>
     * @param storeUnlabeled
     *            should predictions for training instances be stored as well?
     * @param predictedClassNames
     *            predicted class names
     * @param classNameToIdMap
     *            class name to class id mapping
     */
    protected void updateSemiSupervisedPredictions(ClassifierEvaluation ce, List<InstanceClassInfo> listClassInfo,
            boolean storeUnlabeled, String[] predictedClassNames, Map<String, Integer> classNameToIdMap) {
        for (int i = 0; i < predictedClassNames.length; i++) {
            InstanceClassInfo classInfo = listClassInfo.get(i);
            boolean train = classInfo.isTrain();
            // if we are storing unlabeled instance ids, save this instance
            // evaluation
            // else only store it if this is a test instance id - save it
            if (storeUnlabeled || !train) {
                ClassifierInstanceEvaluation cie = new ClassifierInstanceEvaluation();
                cie.setClassifierEvaluation(ce);
                cie.setInstanceId(classInfo.getInstanceId());
                cie.setPredictedClassId(classNameToIdMap.get(predictedClassNames[i]));
                // look the target up as an Integer: unboxing a missing entry
                // straight into an int would fail with a NullPointerException.
                // A target class name without a mapping is handled like class
                // id 0, i.e. no target class id is stored.
                Integer targetClassId = classNameToIdMap.get(classInfo.getTargetClassName());
                if (targetClassId != null && targetClassId.intValue() != 0)
                    cie.setTargetClassId(targetClassId.intValue());
                // add the instance eval to the parent
                ce.getClassifierInstanceEvaluations().put(cie.getInstanceId(), cie);
            }
        }
    }

    /**
     * load the class id to class name mapping from the
     * <code>class.properties</code> file whose name is built by
     * <code>FileUtil.getScopedFileName</code> from the data directory and the
     * label. Each property key is parsed as the integer class id, the property
     * value is the class name.
     * 
     * @param dataDir
     *            directory that contains the file
     * @param label
     *            label used to build the file name
     * @return the mapping; empty if the file does not exist
     * @throws IOException
     *             if the file exists but cannot be opened or read
     * @throws NumberFormatException
     *             if a property key is not a valid int
     */
    protected BiMap<Integer, String> loadClassIdMap(File dataDir, String label) throws IOException {
        BiMap<Integer, String> classIndexMap = HashBiMap.create();
        String filename = FileUtil.getScopedFileName(dataDir.getPath(), label, null, null, "class.properties");
        File f = new File(filename);
        if (f.exists()) {
            BufferedReader r = null;
            try {
                r = new BufferedReader(new FileReader(f));
                Properties props = new Properties();
                props.load(r);
                for (String key : props.stringPropertyNames()) {
                    classIndexMap.put(Integer.parseInt(key), props.getProperty(key));
                }
            } finally {
                // r is still null if opening the file failed; closing it
                // unconditionally would replace the original exception with a
                // NullPointerException
                if (r != null) {
                    try {
                        r.close();
                    } catch (IOException e) {
                        // best effort: the data has been read, just report it
                        log.warn("error closing " + filename, e);
                    }
                }
            }
        }
        return classIndexMap;
    }

    /**
     * load the class info file. Each non-blank line must consist of 3
     * whitespace-separated tokens: instance id, train flag (an integer, any
     * value other than 0 means train), and target class name.
     * 
     * @param dataDir
     *            not used by this implementation
     * @param classFileName
     *            name of the file to read
     * @return one entry per non-blank line, in file order; null if the file
     *         does not exist or if a line does not consist of exactly 3 tokens
     * @throws IOException
     *             if reading or closing the file fails
     * @throws NumberFormatException
     *             if the instance id or train flag of a line is not a number
     */
    protected List<InstanceClassInfo> loadInstanceClassInfo(File dataDir, String classFileName) throws IOException {
        List<InstanceClassInfo> listClassInfo = null;
        // load instance ids and their class ids
        BufferedReader r = null;
        try {
            r = new BufferedReader(new FileReader(classFileName));
            listClassInfo = new ArrayList<InstanceClassInfo>();
            String line = null;
            while ((line = r.readLine()) != null) {
                String trimmedLine = line.trim();
                if (trimmedLine.length() > 0) {
                    // split the trimmed line on runs of whitespace so that
                    // leading/trailing or repeated delimiters do not produce
                    // empty tokens
                    String classInfoToks[] = trimmedLine.split("\\s+");
                    if (classInfoToks.length != 3) {
                        log.error("error parsing line: " + line);
                        return null;
                    }
                    listClassInfo.add(new InstanceClassInfo(Long.parseLong(classInfoToks[0]),
                            Integer.parseInt(classInfoToks[1]) != 0, classInfoToks[2]));
                }
            }
        } catch (FileNotFoundException fe) {
            log.warn("class.txt file not available: " + classFileName, fe);
            listClassInfo = null;
        } finally {
            if (r != null) {
                r.close();
            }
        }
        return listClassInfo;
    }

    /**
     * load the class info file. Each non-blank line must consist of 3
     * whitespace-separated numbers; going by the documentation of
     * {@link #updateSemiSupervisedPredictions(ClassifierEvaluation, List, boolean, int[])}
     * these are instance id, train/test flag and target class id.
     * 
     * @param dataDir
     *            not used by this implementation
     * @param classFileName
     *            name of the file to read
     * @return one 3-element list per non-blank line, in file order; null if
     *         the file does not exist or if a line does not consist of exactly
     *         3 tokens
     * @throws IOException
     *             if reading or closing the file fails
     * @throws NumberFormatException
     *             if a token is not a valid long
     */
    protected List<List<Long>> loadClassInfo(File dataDir, String classFileName) throws IOException {
        List<List<Long>> listClassInfo = null;
        // load instance ids and their class ids
        BufferedReader r = null;
        try {
            r = new BufferedReader(new FileReader(classFileName));
            listClassInfo = new ArrayList<List<Long>>();
            String line = null;
            while ((line = r.readLine()) != null) {
                String trimmedLine = line.trim();
                if (trimmedLine.length() > 0) {
                    // split the trimmed line on runs of whitespace so that
                    // leading/trailing or repeated delimiters do not produce
                    // empty tokens
                    String classInfoToks[] = trimmedLine.split("\\s+");
                    // check the token count before parsing, so that a
                    // malformed line is reported the same way as in
                    // loadInstanceClassInfo
                    if (classInfoToks.length != 3) {
                        log.error("error parsing line: " + line);
                        return null;
                    }
                    List<Long> classInfo = new ArrayList<Long>(3);
                    for (String tok : classInfoToks) {
                        classInfo.add(Long.parseLong(tok));
                    }
                    listClassInfo.add(classInfo);
                }
            }
        } catch (FileNotFoundException fe) {
            log.warn("class.txt file not available: " + classFileName, fe);
            listClassInfo = null;
        } finally {
            if (r != null) {
                r.close();
            }
        }
        return listClassInfo;
    }
}