org.icgc.dcc.submission.validation.cascading.StructuralCheckFunction.java Source code

Java tutorial

Introduction

Here is the source code for org.icgc.dcc.submission.validation.cascading.StructuralCheckFunction.java

Source

/*
 * Copyright (c) 2013 The Ontario Institute for Cancer Research. All rights reserved.                             
 *                                                                                                               
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with                                  
 * this program. If not, see <http://www.gnu.org/licenses/>.                                                     
 *                                                                                                               
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY                           
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES                          
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT                           
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                                
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED                          
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;                               
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER                              
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN                         
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.submission.validation.cascading;

import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.Lists.newArrayList;
import static org.apache.commons.lang.StringUtils.isBlank;

import java.util.ArrayList;
import java.util.List;

import org.icgc.dcc.core.model.SpecialValue;
import org.icgc.dcc.hadoop.cascading.RemoveHollowTupleFilter;

import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

/**
 * Checks structural aspects of an input data file (header, format, ...)
 * <p>
 * For every incoming line, the function splits the line on {@link #FIELD_SEPARATOR}, verifies that the number of values
 * matches the number of dictionary fields, replaces missing codes with {@link SpecialValue#NO_VALUE} (recording the
 * affected field names in the row's {@link TupleState}), replaces blank values with {@code null} and emits one tuple
 * made of the adjusted values followed by the {@link TupleState}.
 * <p>
 * loader's counterpart is {@link PreProcessFunction}
 * <p>
 * Empty lines have already been filtered out in {@link RemoveHollowTupleFilter}.
 * <p>
 * TODO: this should be split in multiple operations.
 */
@SuppressWarnings("rawtypes")
public class StructuralCheckFunction extends BaseOperation implements Function {

    /** Name of the argument field from which the raw line is read. */
    public static final String LINE_FIELD_NAME = "line";

    /** Character separating the values within a line. */
    public static final char FIELD_SEPARATOR = '\t';

    /**
     * Splitter shared by all invocations of {@link #parseValues(String)}; Guava splitters are immutable and
     * thread-safe, so there is no need to build a new one for every line.
     */
    private static final Splitter FIELD_SPLITTER = Splitter.on(FIELD_SEPARATOR);

    /** Number of dictionary fields, i.e. the number of values every line must contain. */
    private final int headerSize;

    /** Dictionary field names, in the order given to the constructor. */
    private final Fields dictionaryFields;

    /**
     * Creates the function for the given dictionary field names.
     * <p>
     * The declared output fields are the dictionary fields followed by {@link ValidationFields#STATE_FIELD}.
     * 
     * @param fieldNames names of the dictionary fields, in the order in which their values appear in a line
     */
    @SuppressWarnings("unchecked")
    public StructuralCheckFunction(Iterable<String> fieldNames) {
        super(1);
        dictionaryFields = new Fields(Iterables.toArray(fieldNames, String.class));
        headerSize = dictionaryFields.size();
        fieldDeclaration = dictionaryFields.append(ValidationFields.STATE_FIELD);
    }

    /**
     * Parses the line carried by the current arguments and emits a single tuple holding the adjusted values followed
     * by the {@link TupleState} created for that line.
     * 
     * @throws IllegalStateException if the field declaration is missing, if the line does not contain exactly as many
     * values as there are dictionary fields, or if the resulting tuple does not match the field declaration
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
        checkState(fieldDeclaration != null);

        TupleEntry arguments = functionCall.getArguments();

        long offset = arguments.getLong(ValidationFields.OFFSET_FIELD_NAME);
        TupleState tupleState = new TupleState(offset);

        String line = arguments.getString(LINE_FIELD_NAME);
        List<String> values = parseValues(line);
        List<String> adjustedValues = adjustValues(values, tupleState);

        List<Object> tupleValues = Lists.<Object>newArrayList(adjustedValues);
        tupleValues.add(tupleState); // lastly state
        checkState(fieldDeclaration.size() == tupleValues.size());

        functionCall.getOutputCollector().add(new Tuple(tupleValues.toArray()));
    }

    /**
     * Splits a line into its values using {@link #FIELD_SEPARATOR}.
     * 
     * @param line the raw line to split
     * @return a new mutable list holding the values in their order of appearance
     */
    public static List<String> parseValues(String line) {
        return newArrayList(FIELD_SPLITTER.split(line));
    }

    /**
     * Verifies the number of values, then converts missing codes and blank values.
     * 
     * @throws IllegalStateException if the number of values differs from the number of dictionary fields
     */
    private List<String> adjustValues(List<String> values, TupleState tupleState) {
        checkState(headerSize == values.size(), "'%s' != '%s'", headerSize, values.size());
        return replaceEmptyStrings(convertMissingCodes(values, tupleState));
    }

    /**
     * Returns a copy of the values in which every missing code is replaced with {@link SpecialValue#NO_VALUE}; the
     * name of each affected field is registered with the given state.
     */
    private List<String> convertMissingCodes(List<String> values, TupleState tupleState) {
        List<String> adjustedValues = new ArrayList<String>(values.size());
        for (int i = 0; i < values.size(); i++) {
            String value = values.get(i);
            if (SpecialValue.FULL_MISSING_CODES.contains(value)) {
                adjustedValues.add((String) SpecialValue.NO_VALUE);

                // Mark field as originally using a missing code
                tupleState.addMissingField((String) this.getFieldDeclaration().get(i));
            } else {
                adjustedValues.add(value);
            }
        }
        return adjustedValues;
    }

    /**
     * Replaces, in place, every non-null blank value of the given list with {@code null} and returns that same list.
     */
    private List<String> replaceEmptyStrings(List<String> adjustedValues) {
        for (int i = 0; i < adjustedValues.size(); i++) {
            String value = adjustedValues.get(i);
            // Values that are already null need no replacement
            if (value != null && isBlank(value)) {
                adjustedValues.set(i, null);
            }
        }
        return adjustedValues;
    }

}