ubic.gemma.core.analysis.preprocess.batcheffects.AffyScanDateExtractor.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.core.analysis.preprocess.batcheffects.AffyScanDateExtractor.java

Source

/*
 * The Gemma project
 *
 * Copyright (c) 2011 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package ubic.gemma.core.analysis.preprocess.batcheffects;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.*;
import java.nio.*;
import java.util.Date;

/**
 * Extract the scan date from Affymetrix CEL files. Handles both version 3 (ASCII) and 4 (binary) files.
 * <p>
 * See <a href='http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/cel.html'>CEL</a> and
 * <a href='http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/generic.html'>GENERIC</a>
 * </p>
 * Note that the Affymetrix documentation does not mention a date, explicitly, but it's in the "DatHeader"
 *
 * @author paul
 */
public class AffyScanDateExtractor extends BaseScanDateExtractor {

    private static final Log log = LogFactory.getLog(AffyScanDateExtractor.class);

    @Override
    public Date extract(InputStream is) {

        try (DataInputStream str = new DataInputStream(is)) {

            BufferedReader reader;
            Date date = null;

            int magic = this.readByte(str);
            switch (magic) {
            case 64: {

                int version = this.readIntLittleEndian(str);

                if (version != 4) {
                    // it's always supposed to be.
                    throw new IllegalStateException("Affymetrix CEL format not recognized: " + version);
                }

                AffyScanDateExtractor.log.debug(this.readShort(str)); // numrows

                AffyScanDateExtractor.log.debug(this.readShort(str)); // numcols

                AffyScanDateExtractor.log.debug(this.readIntLittleEndian(str)); // numcells

                int headerLen = this.readShort(str);

                if (headerLen == 0) {
                    // throw new IllegalStateException( "Zero header length read" );
                    headerLen = 800;
                }

                AffyScanDateExtractor.log.debug(headerLen);

                StringBuilder buf = new StringBuilder();

                for (int i = 0; i < headerLen; i++) {
                    buf.append(new String(new byte[] { str.readByte() }, "US-ASCII"));
                }

                String[] headerLines = StringUtils.split(buf.toString(), "\n");

                for (String string : headerLines) {
                    if (string.startsWith("DatHeader")) {
                        date = this.parseStandardFormat(string);
                        break;
                    }
                }
                break;
            }
            case 59: {

                // Command Console format
                int version = this.readUnsignedByte(str);
                if (version != 1) {
                    // this is fixed to 1 according to affy docs.
                    throw new IllegalStateException("Affymetrix CEL format not recognized: " + version);
                }
                @SuppressWarnings("unused")
                int numDataGroups = this.readIntBigEndian(str); // number of data groups, usually = 1. Each data group

                // contains another header, with different name/value/type
                // triples.
                @SuppressWarnings("unused")
                int filePosOfFirstGroup = this.readIntBigEndian(str); // file position of first data group.

                date = this.parseGenericCCHeader(str);

                // need to find number of parent file headers and then read array of generic file headers

                if (date == null) {
                    /*
                     * Look in the parentheader
                     */

                    date = this.parseGenericCCHeader(str);
                }

                // $dataHeader$parents[[1]]$parameters$`affymetrix-scan-date`

                // also $dataHeader$parents[[1]]$parameters$`affymetrix-Fluidics-HybDate`

                break;
            }
            default:

                /*
                 * assume version 3 plain text.
                 */
                reader = new BufferedReader(new InputStreamReader(is));
                String line;
                int count = 0;
                while ((line = reader.readLine()) != null) {
                    // log.info( line );
                    if (line.startsWith("DatHeader")) {
                        date = this.parseStandardFormat(line);
                    }
                    if (date != null || ++count > 100) { // don't read too much.
                        reader.close();
                        break;
                    }
                }
                break;
            }

            if (date == null) {
                throw new IllegalStateException("Failed to find date");
            }
            AffyScanDateExtractor.log.debug(date);

            return date;

        } catch (IOException e) {
            throw new RuntimeException(e);
        }

    }

    @SuppressWarnings({ "unused", "WeakerAccess" }) // Possible external use
    public Date parseGenericCCHeader(DataInputStream str) throws IOException {

        /*
         * acquisition data, intensity data etc. Usually "intensity data" for the first header.
         */
        String datatypeIdentifier = this.readString(str);

        AffyScanDateExtractor.log.debug(datatypeIdentifier);

        String guid = this.readString(str);

        AffyScanDateExtractor.log.debug(guid);

        @SuppressWarnings("unused")
        String createDate = this.readUnicodeString(str); // blank?
        @SuppressWarnings("unused")
        String locale = this.readUnicodeString(str); // e.g. en-US
        int numKeyValuePairs = this.readIntBigEndian(str); // e.g. 55
        Date result = null;
        for (int i = 0; i < numKeyValuePairs; i++) {
            String name = this.readUnicodeString(str);
            byte[] value = this.readBytes(str);
            String type = this.readUnicodeString(str);
            Object v;

            switch (type) {
            case "text/x-calvin-float": {
                FloatBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asFloatBuffer();
                float[] array = new float[intBuf.remaining()];
                intBuf.get(array);
                break;
            }
            case "text/plain":
            case "text/ascii":
                // text/ascii is undocumented, but needed.
                v = new String(value, "US-ASCII");

                if (name.equals("affymetrix-scan-date")) {
                    result = this.parseISO8601(new String(((String) v).getBytes(), "UTF-16"));
                    AffyScanDateExtractor.log.info("Scan date = " + result);
                }

                if (name.equals("affymetrix-Hyb-Start-Time")) {
                    // We don't use this but I'm curious to start looking at it.
                    AffyScanDateExtractor.log
                            .info("Hyb start date = " + new String(((String) v).getBytes(), "UTF-16"));
                }

                break;
            case "text-x-calvin-unsigned-integer-8": {
                ShortBuffer intBuf = ByteBuffer.wrap(value).asShortBuffer();
                short[] array = new short[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-integer-16": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer(); // wrong?

                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);
                break;
            }
            case "text/x-calvin-integer-32": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer();
                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-unsigned-integer-8": {
                ShortBuffer intBuf = ByteBuffer.wrap(value).asShortBuffer();
                short[] array = new short[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-unsigned-integer-16": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer();// wrong?

                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-unsigned-integer-32": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer();
                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            default:
                throw new IOException("Unknown mime type:" + type);
            }

        }

        @SuppressWarnings("unused")
        int numParentHeaders = this.readIntBigEndian(str);
        return result;
    }

    /**
     * @return 8 bit signed integral number
     */
    private int readByte(DataInputStream dis) throws IOException {
        return dis.readByte();
    }

    /**
     * The data is stored as a int, then the array of bytes.
     */
    private byte[] readBytes(DataInputStream str) throws IOException {
        int fieldLength = this.readIntBigEndian(str);

        byte[] result = new byte[fieldLength];
        for (int i = 0; i < fieldLength; i++) {
            if (str.available() == 0)
                throw new IOException("Reached end of file without string end");
            result[i] = str.readByte();
        }

        return result;
    }

    private int readIntBigEndian(DataInputStream dis) throws IOException {
        byte[] buf = new byte[4];

        for (int i = 0; i < buf.length; i++) {
            buf[i] = dis.readByte();
        }

        return ByteBuffer.wrap(buf).order(ByteOrder.BIG_ENDIAN).getInt();
    }

    /**
     * @return 32 bit signed integral number
     */
    private int readIntLittleEndian(DataInputStream dis) throws IOException {
        return dis.readInt();
    }

    private int readShort(DataInputStream dis) throws IOException {
        return dis.readShort();
    }

    /**
     * @return A 1 byte character string. A string object is stored as an INT (to store the string length) followed by the CHAR
     * array (to store the string contents).
     */
    private String readString(DataInputStream str) throws IOException {
        int fieldLength = this.readIntBigEndian(str);
        StringBuilder buf = new StringBuilder();
        for (int i = 0; i < fieldLength; i++) {
            if (str.available() == 0)
                throw new IOException("Reached end of file without string end");
            buf.append(new String(new byte[] { str.readByte() }));
        }
        return buf.toString();
    }

    /**
     * @return A UNICODE string. A string object is stored as an INT (to store the string length) followed by the WCHAR array
     * (to store the string contents).
     * (http://media.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/generic.html)
     */
    private String readUnicodeString(DataInputStream str) throws IOException {
        int fieldLength = this.readIntBigEndian(str);

        // log.info( fieldLength );

        byte[] buf = new byte[fieldLength * 2];

        // StringBuilder buf = new StringBuilder();
        for (int i = 0; i < fieldLength * 2; i++) {
            buf[i] = str.readByte();
        }

        return new String(buf, "UTF-16");
    }

    private int readUnsignedByte(DataInputStream dis) throws IOException {
        return dis.readUnsignedByte();
    }

}