List of usage examples for java.io BufferedInputStream available
public synchronized int available() throws IOException
From source file:edu.harvard.iq.dvn.ingest.statdataio.impl.plugins.sav.SAVFileReader.java
void decodeRecordTypeDataCompressed(BufferedInputStream stream) throws IOException { dbgLog.fine("***** decodeRecordTypeDataCompressed(): start *****"); if (stream == null) { throw new IllegalArgumentException("decodeRecordTypeDataCompressed: stream == null!"); }// w ww .jav a2 s . c om Map<String, String> formatCategoryTable = new LinkedHashMap<String, String>(); PrintWriter pwout = createOutputWriter(stream); boolean hasStringVarContinuousBlock = obsNonVariableBlockSet.size() > 0 ? true : false; dbgLog.fine("hasStringVarContinuousBlock=" + hasStringVarContinuousBlock); int ii = 0; int OBS = LENGTH_SAV_OBS_BLOCK; int nOBS = OBSUnitsPerCase; dbgLog.fine("OBSUnitsPerCase=" + OBSUnitsPerCase); int caseIndex = 0; dbgLog.fine("printFormatTable:\n" + printFormatTable); dbgLog.fine("printFormatNameTable:\n" + printFormatNameTable); variableFormatTypeList = new String[varQnty]; dbgLog.fine("varQnty: " + varQnty); for (int i = 0; i < varQnty; i++) { variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE .get(printFormatTable.get(variableNameList.get(i))); dbgLog.fine("i=" + i + "th variableFormatTypeList=" + variableFormatTypeList[i]); formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); } dbgLog.fine("variableFormatType:\n" + Arrays.deepToString(variableFormatTypeList)); dbgLog.fine("formatCategoryTable:\n" + formatCategoryTable); // contents (variable) checker concering decimals variableTypeFinal = new int[varQnty]; Arrays.fill(variableTypeFinal, 0); List<String> casewiseRecordForUNF = new ArrayList<String>(); String[] caseWiseDateFormatForUNF = null; List<String> casewiseRecordForTabFile = new ArrayList<String>(); // missing values are written to the tab-delimited file by // using the default or user-specified missing-value strings; // however, to calculate UNF/summary statistics, // classes for these calculations require their specific // missing values that differ from the above missing-value // strings; therefore, after row data for the tab-delimited // file are written, missing values in a row are changed to // UNF/summary-statistics-OK ones. // data-storage object for sumStat dataTable2 = new Object[varQnty][caseQnty]; // storage of date formats to pass to UNF dateFormats = new String[varQnty][caseQnty]; try { // this compression is applied only to non-float data, i.e. integer; // 8-byte float datum is kept in tact boolean hasReachedEOF = false; OBSERVATION: while (true) { dbgLog.fine("SAV Reader: compressed: ii=" + ii + "-th iteration"); byte[] octate = new byte[LENGTH_SAV_OBS_BLOCK]; int nbytes = stream.read(octate); // processCompressedOBSblock () // (this means process a block of 8 compressed OBS // values -- should result in 64 bytes of data total) for (int i = 0; i < LENGTH_SAV_OBS_BLOCK; i++) { dbgLog.finer("i=" + i + "-th iteration"); int octate_i = octate[i]; //dbgLog.fine("octate="+octate_i); if (octate_i < 0) { octate_i += 256; } int byteCode = octate_i;//octate_i & 0xF; //out.println("byeCode="+byteCode); // processCompressedOBS switch (byteCode) { case 252: // end of the file dbgLog.fine("SAV Reader: compressed: end of file mark [FC] was found"); hasReachedEOF = true; break; case 253: // FD: uncompressed data follows after this octate // long string datum or float datum // read the following octate byte[] uncompressedByte = new byte[LENGTH_SAV_OBS_BLOCK]; int ucbytes = stream.read(uncompressedByte); int typeIndex = (ii * OBS + i) % nOBS; if ((OBSwiseTypelList.get(typeIndex) > 0) || (OBSwiseTypelList.get(typeIndex) == -1)) { // code= >0 |-1: string or its conitiguous block // decode as a string object String strdatum = new String( Arrays.copyOfRange(uncompressedByte, 0, LENGTH_SAV_OBS_BLOCK), defaultCharSet); //out.println("str_datum="+strdatum+"<-"); // add this non-missing-value string datum casewiseRecordForTabFile.add(strdatum); //out.println("casewiseRecordForTabFile(String)="+casewiseRecordForTabFile); } else if (OBSwiseTypelList.get(typeIndex) == -2) { String strdatum = new String( Arrays.copyOfRange(uncompressedByte, 0, LENGTH_SAV_OBS_BLOCK - 1), defaultCharSet); casewiseRecordForTabFile.add(strdatum); //out.println("casewiseRecordForTabFile(String)="+casewiseRecordForTabFile); } else if (OBSwiseTypelList.get(typeIndex) == 0) { // code= 0: numeric ByteBuffer bb_double = ByteBuffer.wrap(uncompressedByte, 0, LENGTH_SAV_OBS_BLOCK); if (isLittleEndian) { bb_double.order(ByteOrder.LITTLE_ENDIAN); } Double ddatum = bb_double.getDouble(); // out.println("ddatum="+ddatum); // add this non-missing-value numeric datum casewiseRecordForTabFile.add(doubleNumberFormatter.format(ddatum)); dbgLog.fine("SAV Reader: compressed: added value to dataLine: " + ddatum); } else { dbgLog.fine("SAV Reader: out-of-range exception"); throw new IOException("out-of-range value was found"); } /* // EOF-check after reading this octate if (stream.available() == 0){ hasReachedEOF = true; dbgLog.fine( "SAV Reader: *** After reading an uncompressed octate," + " reached the end of the file at "+ii +"th iteration and i="+i+"th octate position [0-start] *****"); } */ break; case 254: // FE: used as the missing value for string variables // an empty case in a string variable also takes this value // string variable does not accept space-only data // cf: uncompressed case // 20 20 20 20 20 20 20 20 // add the string missing value // out.println("254: String missing data"); casewiseRecordForTabFile.add(" "); // add "." here? // Note that technically this byte flag (254/xFE) means // that *eight* white space characters should be // written to the output stream. This caused me // a great amount of confusion, because it appeared // to me that there was a mismatch between the number // of bytes advertised in the variable metadata and // the number of bytes actually found in the data // section of a compressed SAV file; this is because // these 8 bytes "come out of nowhere"; they are not // written in the data section, but this flag specifies // that they should be added to the output. // Also, as I pointed out above, we are only writing // out one whitespace character, not 8 as instructed. // This appears to be legit; these blocks of 8 spaces // seem to be only used for padding, and all such // multiple padding spaces are stripped anyway during // the post-processing. break; case 255: // FF: system missing value for numeric variables // cf: uncompressed case (sysmis) // FF FF FF FF FF FF eF FF(little endian) // add the numeric missing value dbgLog.fine("SAV Reader: compressed: Missing Value, numeric"); casewiseRecordForTabFile.add(MissingValueForTextDataFileNumeric); break; case 0: // 00: do nothing dbgLog.fine("SAV Reader: compressed: doing nothing (zero); "); break; default: //out.println("byte code(default)="+ byteCode); if ((byteCode > 0) && (byteCode < 252)) { // datum is compressed //Integer unCompressed = Integer.valueOf(byteCode -100); // add this uncompressed numeric datum Double unCompressed = Double.valueOf(byteCode - 100); dbgLog.fine("SAV Reader: compressed: default case: " + unCompressed); casewiseRecordForTabFile.add(doubleNumberFormatter.format(unCompressed)); // out.println("uncompressed="+unCompressed); // out.println("dataline="+casewiseRecordForTabFile); } }// end of switch // out.println("end of switch"); // The-end-of-a-case(row)-processing // this line that follows, and the code around it // is really confusing: int varCounter = (ii * OBS + i + 1) % nOBS; // while both OBS and LENGTH_SAV_OBS_BLOCK = 8 // (OBS was initialized as OBS=LENGTH_SAV_OBS_BLOCK), // the 2 values mean different things: // LENGTH_SAV_OBS_BLOCK is the number of bytes in one OBS; // and OBS is the number of OBS blocks that we process // at a time. I.e., we process 8 chunks of 8 bytes at a time. // This is how data is organized inside an SAV file: // 8 bytes of compression flags, followd by 8x8 or fewer // (depending on the flags) bytes of compressed data. // I should rename this OBS variable something more // meaningful. // // Also, the "varCounter" variable name is entirely // misleading -- it counts not variables, but OBS blocks. dbgLog.fine("SAV Reader: compressed: OBS counter=" + varCounter + "(ii=" + ii + ")"); if ((ii * OBS + i + 1) % nOBS == 0) { //out.println("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); // out.println("all variables in a case are parsed == nOBS"); // out.println("hasStringVarContinuousBlock="+hasStringVarContinuousBlock); // check whether a string-variable's continuous block exits // if so, they must be joined if (hasStringVarContinuousBlock) { // string-variable's continuous-block-concatenating-processing //out.println("concatenating process starts"); //out.println("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); //out.println("casewiseRecordForTabFile(before:size)="+casewiseRecordForTabFile.size()); StringBuilder sb = new StringBuilder(""); int firstPosition = 0; Set<Integer> removeJset = new HashSet<Integer>(); for (int j = 0; j < nOBS; j++) { dbgLog.fine("RTD: j=" + j + "-th type =" + OBSwiseTypelList.get(j)); if ((OBSwiseTypelList.get(j) == -1) || (OBSwiseTypelList.get(j) == -2)) { // Continued String variable found at j-th // position. look back the j-1 firstPosition = j - 1; int lastJ = j; String concatenated = null; removeJset.add(j); sb.append(casewiseRecordForTabFile.get(j - 1)); sb.append(casewiseRecordForTabFile.get(j)); for (int jc = 1;; jc++) { if ((j + jc == nOBS) || ((OBSwiseTypelList.get(j + jc) != -1) && (OBSwiseTypelList.get(j + jc) != -2))) { // j is the end unit of this string variable concatenated = sb.toString(); sb.setLength(0); lastJ = j + jc; break; } else { sb.append(casewiseRecordForTabFile.get(j + jc)); removeJset.add(j + jc); } } casewiseRecordForTabFile.set(j - 1, concatenated); //out.println(j-1+"th concatenated="+concatenated); j = lastJ - 1; } // end-of-if: continuous-OBS only } // end of loop-j //out.println("removeJset="+removeJset); // a new list that stores a new case with concatanated string data List<String> newDataLine = new ArrayList<String>(); for (int jl = 0; jl < casewiseRecordForTabFile.size(); jl++) { //out.println("jl="+jl+"-th datum =["+casewiseRecordForTabFile.get(jl)+"]"); if (!removeJset.contains(jl)) { // if (casewiseRecordForTabFile.get(jl).equals(MissingValueForTextDataFileString)){ // out.println("NA-S jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals(MissingValueForTextDataFileNumeric)){ // out.println("NA-N jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl)==null){ // out.println("null case jl="+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals("NaN")){ // out.println("NaN jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals("")){ // out.println("blank jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals(" ")){ // out.println("space jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } newDataLine.add(casewiseRecordForTabFile.get(jl)); } else { // out.println("Excluded: jl="+jl+"-th datum=["+casewiseRecordForTabFile.get(jl)+"]"); } } // end of loop-jl //out.println("new casewiseRecordForTabFile="+newDataLine); //out.println("new casewiseRecordForTabFile(size)="+newDataLine.size()); casewiseRecordForTabFile = newDataLine; } // end-if: stringContinuousVar-exist case for (int el = 0; el < casewiseRecordForTabFile.size(); el++) { casewiseRecordForUNF.add(casewiseRecordForTabFile.get(el)); } caseWiseDateFormatForUNF = new String[casewiseRecordForTabFile.size()]; // caseIndex starts from 1 not 0 caseIndex = (ii * OBS + i + 1) / nOBS; for (int k = 0; k < casewiseRecordForTabFile.size(); k++) { dbgLog.fine("k=" + k + "-th variableTypelList=" + variableTypelList.get(k)); if (variableTypelList.get(k) > 0) { // String variable case: set to -1 variableTypeFinal[k] = -1; // Strip the String variables off the // whitespace padding: // [ snipped ] // I've removed the block of code above where // String values were substring()-ed to the // length specified in the variable metadata; // Doing that was not enough, since a string // can still be space-padded inside its // advertised capacity. (note that extended // variables can have many kylobytes of such // padding in them!) Plus it was completely // redundant, since we are stripping all the // trailing white spaces with // StringUtils.stripEnd() below: String paddRemoved = StringUtils .stripEnd(casewiseRecordForTabFile.get(k).toString(), null); // TODO: clean this up. For now, just make sure that strings contain at least one blank space. if (paddRemoved.equals("")) { paddRemoved = " "; } casewiseRecordForUNF.set(k, paddRemoved); casewiseRecordForTabFile.set(k, "\"" + paddRemoved.replaceAll("\"", Matcher.quoteReplacement("\\\"")) + "\""); // end of String var case } else { // numeric var case if (casewiseRecordForTabFile.get(k).equals(MissingValueForTextDataFileNumeric)) { casewiseRecordForUNF.set(k, null); } } // end of variable-type check if (casewiseRecordForTabFile.get(k) != null && !casewiseRecordForTabFile.get(k) .equals(MissingValueForTextDataFileNumeric)) { String variableFormatType = variableFormatTypeList[k]; dbgLog.finer("k=" + k + "th printFormatTable format=" + printFormatTable.get(variableNameList.get(k))); int formatDecimalPointPosition = formatDecimalPointPositionList.get(k); if (variableFormatType.equals("date")) { dbgLog.finer("date case"); long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymd.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); caseWiseDateFormatForUNF[k] = sdf_ymd.toPattern(); /* saving date format */ dbgLog.finer("setting caseWiseDateFormatForUNF[k] = " + sdf_ymd.toPattern()); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); //formatCategoryTable.put(variableNameList.get(k), "date"); } else if (variableFormatType.equals("time")) { dbgLog.finer("time case:DTIME or DATETIME or TIME"); //formatCategoryTable.put(variableNameList.get(k), "time"); if (printFormatTable.get(variableNameList.get(k)).equals("DTIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long .parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_BIAS; String newDatum = sdf_dhms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString() .split("\\."); dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_BIAS; StringBuilder sb_time = new StringBuilder( sdf_dhms.format(new Date(dateDatum))); dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append( "." + timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); casewiseRecordForUNF.set(k, sb_time.toString()); } } else if (printFormatTable.get(variableNameList.get(k)).equals("DATETIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long .parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymdhms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); caseWiseDateFormatForUNF[k] = sdf_ymdhms.toPattern(); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString() .split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_OFFSET; StringBuilder sb_time = new StringBuilder( sdf_ymdhms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append( "." + timeData[1].substring(0, formatDecimalPointPosition)); } caseWiseDateFormatForUNF[k] = sdf_ymdhms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : ""); dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); casewiseRecordForUNF.set(k, sb_time.toString()); } } else if (printFormatTable.get(variableNameList.get(k)).equals("TIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long .parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L; String newDatum = sdf_hms.format(new Date(dateDatum)); caseWiseDateFormatForUNF[k] = sdf_hms.toPattern(); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString() .split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L; StringBuilder sb_time = new StringBuilder( sdf_hms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append( "." + timeData[1].substring(0, formatDecimalPointPosition)); } caseWiseDateFormatForUNF[k] = this.sdf_hms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : ""); dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); casewiseRecordForUNF.set(k, sb_time.toString()); } } } else if (variableFormatType.equals("other")) { dbgLog.finer("other non-date/time case:=" + i); if (printFormatTable.get(variableNameList.get(k)).equals("WKDAY")) { // day of week dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.WEEKDAY_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.WEEKDAY_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); dbgLog.finer("wkday:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } else if (printFormatTable.get(variableNameList.get(k)).equals("MONTH")) { // month dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.MONTH_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.MONTH_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); dbgLog.finer("month:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } } } // end: date-time-datum check } // end: loop-k(2nd: variable-wise-check) // write to tab file if (casewiseRecordForTabFile.size() > 0) { pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); } if (casewiseRecordForTabFile.size() > 0) { for (int ij = 0; ij < varQnty; ij++) { dataTable2[ij][caseIndex - 1] = casewiseRecordForUNF.get(ij); if (variableFormatTypeList[ij].equals("date") || variableFormatTypeList[ij].equals("time")) { this.dateFormats[ij][caseIndex - 1] = caseWiseDateFormatForUNF[ij]; } } } // numeric contents-check for (int l = 0; l < casewiseRecordForTabFile.size(); l++) { if (variableFormatTypeList[l].equals("date") || variableFormatTypeList[l].equals("time") || printFormatTable.get(variableNameList.get(l)).equals("WKDAY") || printFormatTable.get(variableNameList.get(l)).equals("MONTH")) { variableTypeFinal[l] = -1; } if (variableTypeFinal[l] == 0) { if (casewiseRecordForTabFile.get(l).toString().indexOf(".") >= 0) { // TODO - check for large numbers // l-th variable is not integer variableTypeFinal[l] = 1; decimalVariableSet.add(l); } } } // reset the case-wise working objects casewiseRecordForUNF.clear(); casewiseRecordForTabFile.clear(); if (caseQnty > 0) { if (caseIndex == caseQnty) { hasReachedEOF = true; } } if (hasReachedEOF) { break; } } // if(The-end-of-a-case(row)-processing) } // loop-i (OBS unit) if ((hasReachedEOF) || (stream.available() == 0)) { // reached the end of this file // do exit-processing dbgLog.fine("***** reached the end of the file at " + ii + "th iteration *****"); break OBSERVATION; } ii++; } // while loop pwout.close(); } catch (IOException ex) { throw ex; } smd.setDecimalVariables(decimalVariableSet); smd.getFileInformation().put("caseQnty", caseQnty); smd.setVariableFormatCategory(formatCategoryTable); dbgLog.info("<<<<<<"); //dbgLog.info("printFormatList = " + printFormatList); //dbgLog.info("printFormatNameTable = " + printFormatNameTable); dbgLog.info("formatCategoryTable = " + formatCategoryTable); dbgLog.info(">>>>>>"); // contents check //out.println("variableType="+ArrayUtils.toString(variableTypeFinal)); dbgLog.fine("decimalVariableSet=" + decimalVariableSet); //out.println("variableTypelList=\n"+ variableTypelList.toString()); // out.println("dataTable2:\n"+Arrays.deepToString(dataTable2)); dbgLog.fine("***** decodeRecordTypeDataCompressed(): end *****"); }