List of usage examples for the java.io.BufferedInputStream.available() method
public synchronized int available() throws IOException
From source file:com.wabacus.WabacusFacade.java
public static void downloadFile(HttpServletRequest request, HttpServletResponse response) { response.setContentType("application/x-msdownload;"); BufferedInputStream bis = null; BufferedOutputStream bos = null; String realfilepath = null;/*from www . j a v a 2 s. com*/ try { bos = new BufferedOutputStream(response.getOutputStream()); String serverfilename = request.getParameter("serverfilename"); String serverfilepath = request.getParameter("serverfilepath"); String newfilename = request.getParameter("newfilename"); if (serverfilename == null || serverfilename.trim().equals("")) { bos.write("????".getBytes()); return; } if (serverfilename.indexOf("/") >= 0 || serverfilename.indexOf("\\") >= 0) { bos.write("?????".getBytes()); return; } if (serverfilepath == null || serverfilepath.trim().equals("")) { bos.write("??".getBytes()); return; } if (newfilename == null || newfilename.trim().equals("")) newfilename = serverfilename; newfilename = WabacusAssistant.getInstance().encodeAttachFilename(request, newfilename); response.setHeader("Content-disposition", "attachment;filename=" + newfilename); //response.setHeader("Content-disposition","inline;filename="+newfilename); String realserverfilepath = null; if (Tools.isDefineKey("$", serverfilepath)) { realserverfilepath = Config.getInstance().getResourceString(null, null, serverfilepath, true); } else { realserverfilepath = WabacusUtils.decodeFilePath(serverfilepath); } if (realserverfilepath == null || realserverfilepath.trim().equals("")) { bos.write(("?" 
+ serverfilepath + "??").getBytes()); } realserverfilepath = WabacusAssistant.getInstance().parseConfigPathToRealPath(realserverfilepath, Config.webroot_abspath); if (Tools.isDefineKey("classpath", realserverfilepath)) { realserverfilepath = Tools.getRealKeyByDefine("classpath", realserverfilepath); realserverfilepath = Tools.replaceAll(realserverfilepath + "/" + serverfilename, "//", "/").trim(); while (realserverfilepath.startsWith("/")) realserverfilepath = realserverfilepath.substring(1);//???ClassLoader?Class?/ bis = new BufferedInputStream( ConfigLoadManager.currentDynClassLoader.getResourceAsStream(realserverfilepath)); response.setContentLength(bis.available()); } else { File downloadFileObj = new File(FilePathAssistant.getInstance() .standardFilePath(realserverfilepath + File.separator + serverfilename)); if (!downloadFileObj.exists() || downloadFileObj.isDirectory()) { bos.write(("?" + serverfilename).getBytes()); return; } //response.setHeader("Content-Length", String.valueOf(downloadFileObj.length())); bis = new BufferedInputStream(new FileInputStream(downloadFileObj)); } byte[] buff = new byte[1024]; int bytesRead; while ((bytesRead = bis.read(buff, 0, buff.length)) != -1) { bos.write(buff, 0, bytesRead); } } catch (IOException e) { throw new WabacusRuntimeException("" + realfilepath + "", e); } finally { try { if (bis != null) bis.close(); } catch (IOException e) { log.warn("" + realfilepath + "?", e); } try { if (bos != null) bos.close(); } catch (IOException e) { log.warn("" + realfilepath + "?", e); } } }
From source file:org.apache.carbondata.sdk.file.ImageTest.java
public void binaryToCarbonWithHWD(String sourceImageFolder, String outputPath, String preDestPath, String sufAnnotation, final String sufImage, int numToWrite) throws Exception { int num = 1;/* w w w. ja v a2 s. c om*/ Field[] fields = new Field[7]; fields[0] = new Field("height", DataTypes.INT); fields[1] = new Field("width", DataTypes.INT); fields[2] = new Field("depth", DataTypes.INT); fields[3] = new Field("binaryName", DataTypes.STRING); fields[4] = new Field("binary", DataTypes.BINARY); fields[5] = new Field("labelName", DataTypes.STRING); fields[6] = new Field("labelContent", DataTypes.STRING); byte[] originBinary = null; // read and write image data for (int j = 0; j < num; j++) { Object[] files = listFiles(sourceImageFolder, sufImage).toArray(); int index = 0; if (null != files) { CarbonWriter writer = CarbonWriter.builder().outputPath(outputPath).withCsvInput(new Schema(fields)) .withBlockSize(256).writtenBy("SDKS3Example").withPageSizeInMb(1).build(); for (int i = 0; i < files.length; i++) { if (0 == index % numToWrite) { writer.close(); writer = CarbonWriter.builder().outputPath(outputPath).withCsvInput(new Schema(fields)) .withBlockSize(256).writtenBy("SDKS3Example").withPageSizeInMb(1).build(); } index++; // read image and encode to Hex File file = new File((String) files[i]); System.out.println(file.getCanonicalPath()); BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file)); int depth = 0; boolean isGray; boolean hasAlpha; BufferedImage bufferedImage = null; try { bufferedImage = ImageIO.read(file); isGray = bufferedImage.getColorModel().getColorSpace().getType() == ColorSpace.TYPE_GRAY; hasAlpha = bufferedImage.getColorModel().hasAlpha(); if (isGray) { depth = 1; } else if (hasAlpha) { depth = 4; } else { depth = 3; } } catch (Exception e) { e.printStackTrace(); System.out.println(i); ImageInputStream stream = new FileImageInputStream(new File(file.getCanonicalPath())); Iterator<ImageReader> iter = 
ImageIO.getImageReaders(stream); Exception lastException = null; while (iter.hasNext()) { ImageReader reader = null; try { reader = (ImageReader) iter.next(); ImageReadParam param = reader.getDefaultReadParam(); reader.setInput(stream, true, true); Iterator<ImageTypeSpecifier> imageTypes = reader.getImageTypes(0); while (imageTypes.hasNext()) { ImageTypeSpecifier imageTypeSpecifier = imageTypes.next(); System.out .println(imageTypeSpecifier.getColorModel().getColorSpace().getType()); int bufferedImageType = imageTypeSpecifier.getBufferedImageType(); if (bufferedImageType == BufferedImage.TYPE_BYTE_GRAY) { param.setDestinationType(imageTypeSpecifier); break; } } bufferedImage = reader.read(0, param); isGray = bufferedImage.getColorModel().getColorSpace() .getType() == ColorSpace.TYPE_GRAY; hasAlpha = bufferedImage.getColorModel().hasAlpha(); if (isGray) { depth = 1; } else if (hasAlpha) { depth = 4; } else { depth = 3; } if (null != bufferedImage) break; } catch (Exception e2) { lastException = e2; } finally { if (null != reader) reader.dispose(); } } // If you don't have an image at the end of all readers if (null == bufferedImage) { if (null != lastException) { throw lastException; } } } finally { originBinary = new byte[bis.available()]; while ((bis.read(originBinary)) != -1) { } String txtFileName = file.getCanonicalPath().split(sufImage)[0] + sufAnnotation; BufferedInputStream txtBis = new BufferedInputStream(new FileInputStream(txtFileName)); String txtValue = null; byte[] txtBinary = null; txtBinary = new byte[txtBis.available()]; while ((txtBis.read(txtBinary)) != -1) { txtValue = new String(txtBinary, "UTF-8"); } // write data writer.write(new Object[] { bufferedImage.getHeight(), bufferedImage.getWidth(), depth, file.getCanonicalPath(), originBinary, txtFileName, txtValue.replace("\n", "") }); bis.close(); } } writer.close(); } } CarbonReader reader = CarbonReader.builder(outputPath).build(); System.out.println("\nData:"); int i = 0; while (i < 20 && 
reader.hasNext()) { Object[] row = (Object[]) reader.readNextRow(); byte[] outputBinary = (byte[]) row[1]; System.out.println(row[2] + " " + row[3] + " " + row[4] + " " + row[5] + " image size:" + outputBinary.length + " " + row[0]); // save image, user can compare the save image and original image String destString = preDestPath + i + sufImage; BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(destString)); bos.write(outputBinary); bos.close(); i++; } System.out.println("\nFinished"); reader.close(); }
From source file:org.jab.docsearch.spider.LinkFinder.java
/** * Get all links from page//w w w .ja v a 2 s. c o m */ public void getAllLinks() { // writes links from a page out to a file String urlStr = pageName; String shortUrl = ""; numUnChanged = 0; numSkips = 0; int numSuccesses = 0; int numFailed = 0; int numNoRobots = 0; addLink(urlStr); domainUrl = Utils.getDomainURL(urlStr); if (logger.isDebugEnabled()) { logger.debug("getAllLinks() domain url='" + domainUrl + "'"); } SpiderUrl curl = new SpiderUrl(urlStr); baseUrlFolder = Utils.getBaseURLFolder(urlStr); int curLinkNo = 0; boolean completedSpider = false; boolean isDead = false; int curPread = 0; if (ds != null) { ds.setIsWorking(true); ds.setProgressMax(maxLinksToFind); ds.setCurProgressMSG("Spidering Files..."); } int numSpidered = 0; int curSuccessNo = 0; // start spider while (curLinkNo != -1) { BufferedInputStream urlStream = null; FileOutputStream fileOutStream = null; try { completedSpider = false; isDead = false; if (ds != null) { ds.setCurProgress(curPread); if (!ds.getIsWorking()) { break; } } curLinkNo = getNextUrlNo(); if (curLinkNo == -1) { logger.debug("getAllLinks() end of links reached."); break; } else { urlStr = getLinkNameByNo(curLinkNo); logger.info("getAllLinks() analyzing page='" + urlStr + "'"); curl = getSpiderUrl(curLinkNo); } shortUrl = Utils.concatEnd(urlStr, 33); setStatus(I18n.getString("connecting_to") + " " + shortUrl); // open url URL url = new URL(urlStr); URLConnection conn = url.openConnection(); conn.setDoInput(true); conn.setUseCaches(false); conn.setRequestProperty("User-Agent", "DocSearcher " + I18n.getString("ds.version")); conn.connect(); urlStream = new BufferedInputStream(conn.getInputStream()); // filesize int fileSize = conn.getContentLength(); if (fileSize > maxFileSizeToGet) { String ex = I18n.getString("skipping_file_too_big") + " (" + fileSize + " > " + maxFileSizeToGet + ") " + shortUrl; setStatus(ex); throw new Exception(ex); } setStatus(I18n.getString("downloading_uc") + "... 
" + shortUrl + " " + fileSize + " " + I18n.getString("bytes")); curl.setSize(fileSize); // last modified long curModified = conn.getLastModified(); // was .getDate(); curl.setLastModified(curModified); // content type String curContentType = netUtils.getContentType(conn); curl.setContentType(curContentType); // build the value for downloadFile String dnldTmpName = getDownloadFileName(curl.getContentType(), urlStr.toLowerCase()); String downloadFile = FileUtils.addFolder(downloadFileDir, dnldTmpName); // TODO it is better to use content type! boolean curIsWebPage = isHtml(urlStr.toLowerCase()) || (curContentType.toLowerCase().indexOf("html") != -1); logger.debug("getAllLinks() saving to " + downloadFile); fileOutStream = new FileOutputStream(downloadFile); int curSize = 0; int curI; int lastPercent = 0; StringBuilder tag = new StringBuilder(); String link = null; boolean inTag = false; boolean getFileSizeFromStream = false; if (fileSize == -1) { getFileSizeFromStream = true; } while ((curI = urlStream.read()) != -1) { fileOutStream.write(curI); curSize++; if (ds != null) { if (!ds.getIsWorking()) { break; } } // fix problem if filesize not in content length if (getFileSizeFromStream) { fileSize = curSize + urlStream.available(); } // notify of download progress if (curSize > 0 && (curSize % 10) == 0) { int curPercent = (curSize * 100) / fileSize; if (curPercent != lastPercent) { lastPercent = curPercent; setStatus(I18n.getString("downloading_uc") + "... : (" + shortUrl + ") --> " + curPercent + " %" + " ( " + (numSuccesses + numFailed + numNoRobots) + "/" + getNumLinksFound() + ")"); } } // end for percent updates else if (curSize % 40 == 0) { setStatus(I18n.getString("downloading_uc") + "... 
: (" + shortUrl + ") --> " + curSize + " " + I18n.getString("bytes")); } // handle links if (curIsWebPage) { char c = (char) curI; // LOOK AT THE TAGS // start tag if (c == '<') { inTag = true; tag = new StringBuilder(); } // end tag else if (c == '>') { inTag = false; tag.append(c); String realTag = tag.toString(); String lowerTag = realTag.toLowerCase(); // TODO fix problem with spaces before = // link if (lowerTag.startsWith("<a ")) { link = Utils.getTagString("href=", realTag); link = Utils.getNormalUrl(link); doPossibleAdd(urlStr, link); } // area else if (lowerTag.startsWith("<area")) { link = Utils.getTagString("href=", realTag); link = Utils.getNormalUrl(link); doPossibleAdd(urlStr, link); } // TODO is in param realy a link? else if (lowerTag.startsWith("<param")) { String appletParam = Utils.getTagString("name=", realTag); if (appletParam.toLowerCase().equals("url")) { link = Utils.getTagString("value=", realTag); link = Utils.getNormalUrl(link); doPossibleAdd(urlStr, link); } } } // in tag if (inTag) { tag.append(c); } } // filesize ok if (getFileSizeFromStream && fileSize > maxFileSizeToGet) { break; } } // end while downloading curPread++; fileOutStream.close(); urlStream.close(); curl.setMd5(FileUtils.getMD5Sum(downloadFile)); // now add out document if (ds != null) { curSuccessNo = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curl); switch (curSuccessNo) { case 0: // good numSuccesses++; break; case 1: // bad numFailed++; break; case 2: // meta robots - no index numNoRobots++; break; } } // delete temp file if (!FileUtils.deleteFile(downloadFile)) { logger.warn("getAllLinks() can't delete file '" + downloadFile + "'"); } numSpidered++; completedSpider = true; // max links found if (numSpidered > maxLinksToFind) { break; } } catch (Exception e) { logger.fatal("getAllLinks() failed", e); setStatus(I18n.getString("error") + " : " + e.toString()); isDead = true; } finally { // close resources IOUtils.closeQuietly(urlStream); 
IOUtils.closeQuietly(fileOutStream); curl.setSpidered(completedSpider); curl.setIsDeadLink(isDead); setStatus(I18n.getString("download_complete") + " " + shortUrl); } } // end for iterating over links if (ds != null) { ds.resetProgress(); } saveAllLinks(); logger.info("getAllLinks() " + numSpidered + " total web pages spidered for links."); showMessage(I18n.getString("spidering_complete") + " (" + Utils.concatStrToEnd(pageName, 28) + ") ", numSpidered + " " + I18n.getString("documents_indexed") + " " + getNumLinksFound() + " " + I18n.getString("links_found") + "\n\n" + numSuccesses + " " + I18n.getString("documents_spidered_successful") + "\n\n" + numFailed + " " + I18n.getString("documents_spidered_failed") + "\n\n" + numNoRobots + " " + I18n.getString("documents_not_spidered")); }
From source file:edu.harvard.iq.dvn.ingest.statdataio.impl.plugins.dta.DTAFileReader.java
void parseValueLabelsRelease105(BufferedInputStream stream) throws IOException { dbgLog.fine("***** parseValueLabelsRelease105(): start *****"); if (stream == null) { throw new IllegalArgumentException("stream == null!"); }/*from w w w .j av a 2s . c o m*/ int nvar = (Integer) smd.getFileInformation().get("varQnty"); int length_label_name = constantTable.get("NAME") + 1; // note: caution +1 as the null character, not 9 byte int length_value_label_header = value_label_table_length + length_label_name; if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("value_label_table_length=" + value_label_table_length); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length_value_label_header=" + length_value_label_header); int length_lable_name_field = 8; /* Seg field byte type 1-1. no of pairs 2 int (= m) 1-2. vlt_name 10 includes char+(\0) == name used in Sec2.part 5 ----------------------------------- 11 2-1. values 2*n int[] 2-2. labels 8*n char */ for (int i = 0; i < nvar; i++) { if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("\n\n" + i + "th value-label table header"); byte[] valueLabelHeader = new byte[length_value_label_header]; // Part 1: reading the header of a value-label table if exists int nbytes = stream.read(valueLabelHeader, 0, length_value_label_header); if (nbytes == 0) { throw new IOException("reading value label header: no datum"); } // 1.1 number of value-label pairs in this table (= m) ByteBuffer bb_value_label_pairs = ByteBuffer.wrap(valueLabelHeader, 0, value_label_table_length); if (isLittleEndian) { bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN); //if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("value lable table lenth: byte reversed"); } int no_value_label_pairs = bb_value_label_pairs.getShort(); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs); // 1.2 labelName String rawLabelName = new String(Arrays.copyOfRange(valueLabelHeader, value_label_table_length, (value_label_table_length + length_label_name)), 
"ISO-8859-1"); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("rawLabelName(length)=" + rawLabelName.length()); String labelName = rawLabelName.substring(0, rawLabelName.indexOf(0)); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("label name = " + labelName + "\n"); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + "-th value-label table"); // Part 2: reading the value-label table // the length of the value-label table is: 2*m + 8*m = 10*m int length_value_label_table = (value_label_table_length + length_lable_name_field) * no_value_label_pairs; if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length_value_label_table=" + length_value_label_table); byte[] valueLabelTable_i = new byte[length_value_label_table]; int noBytes = stream.read(valueLabelTable_i, 0, length_value_label_table); if (noBytes == 0) { throw new IOException("reading value label table: no datum"); } // 2-1. 2-byte-integer array (2*m): value array (sorted) short[] valueList = new short[no_value_label_pairs]; int offset_value = 0; for (int k = 0; k < no_value_label_pairs; k++) { ByteBuffer bb_value_list = ByteBuffer.wrap(valueLabelTable_i, offset_value, value_label_table_length); if (isLittleEndian) { bb_value_list.order(ByteOrder.LITTLE_ENDIAN); } valueList[k] = bb_value_list.getShort(); offset_value += value_label_table_length; } if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("value_list=" + Arrays.toString(valueList) + "\n"); // 2-2. 
8-byte chars that store label data (m units of labels) if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("current offset_value=" + offset_value); int offset_start = offset_value; int offset_end = offset_value + length_lable_name_field; String[] labelList = new String[no_value_label_pairs]; for (int l = 0; l < no_value_label_pairs; l++) { String string_l = new String(Arrays.copyOfRange(valueLabelTable_i, offset_start, offset_end), "ISO-8859-1"); int null_position = string_l.indexOf(0); if (null_position != -1) { labelList[l] = string_l.substring(0, null_position); } else { labelList[l] = string_l; } offset_start = offset_end; offset_end += length_lable_name_field; } Map<String, String> tmpValueLabelTable = new LinkedHashMap<String, String>(); for (int j = 0; j < no_value_label_pairs; j++) { if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(j + "-th pair:" + valueList[j] + "[" + labelList[j] + "]"); tmpValueLabelTable.put(Integer.toString(valueList[j]), labelList[j]); } valueLabelTable.put(labelName, tmpValueLabelTable); if (stream.available() == 0) { // reached the end of this file // do exit-processing if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("***** reached the end of the file at " + i + "th value-label Table *****"); break; } } // for-loop if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("valueLabelTable:\n" + valueLabelTable); smd.setValueLabelTable(valueLabelTable); dbgLog.fine("***** parseValueLabelsRelease105(): end *****"); }
From source file:edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTAFileReader.java
void parseValueLabelsRelease105(BufferedInputStream stream) throws IOException { dbgLog.fine("parseValueLabelsRelease105(): start"); if (stream == null) { throw new IllegalArgumentException("stream == null!"); }/* w w w . j a va2s . c om*/ int nvar = dataTable.getVarQuantity().intValue(); int length_label_name = constantTable.get("NAME") + 1; // note: caution +1 as the null character, not 9 byte int length_value_label_header = value_label_table_length + length_label_name; if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("value_label_table_length=" + value_label_table_length); } if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("length_value_label_header=" + length_value_label_header); } int length_lable_name_field = 8; /* Seg field byte type 1-1. no of pairs 2 int (= m) 1-2. vlt_name 10 includes char+(\0) == name used in Sec2.part 5 ----------------------------------- 11 2-1. values 2*n int[] 2-2. labels 8*n char */ // This map will hold a temporary lookup table for all the categorical // value-label groups we are going to find here: // These groups have unique names, and a group *may be shared* between // multiple variables. In the method decodeDescriptorValueLabel above // we have populated a lookup table where variables are linked to the // corresponding value-label groups by name. Thus we must fully populate // the full map of all the variable groups, then go through the list // of variables and create the dataverse variable categories from // them. -- L.A. 
4.0 Map<String, Map<String, String>> tempValueLabelTable = new LinkedHashMap<>(); for (int i = 0; i < nvar; i++) { if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("\n\n" + i + "th value-label table header"); } byte[] valueLabelHeader = new byte[length_value_label_header]; // Part 1: reading the header of a value-label table if exists int nbytes = stream.read(valueLabelHeader, 0, length_value_label_header); if (nbytes == 0) { throw new IOException("reading value label header: no datum"); } // 1.1 number of value-label pairs in this table (= m) ByteBuffer bb_value_label_pairs = ByteBuffer.wrap(valueLabelHeader, 0, value_label_table_length); if (isLittleEndian) { bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN); //if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("value lable table lenth: byte reversed"); } int no_value_label_pairs = bb_value_label_pairs.getShort(); if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs); } // 1.2 labelName String rawLabelName = new String(Arrays.copyOfRange(valueLabelHeader, value_label_table_length, (value_label_table_length + length_label_name)), "ISO-8859-1"); if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("rawLabelName(length)=" + rawLabelName.length()); } String labelName = rawLabelName.substring(0, rawLabelName.indexOf(0)); if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("label name = " + labelName + "\n"); } if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine(i + "-th value-label table"); } // Part 2: reading the value-label table // the length of the value-label table is: 2*m + 8*m = 10*m int length_value_label_table = (value_label_table_length + length_lable_name_field) * no_value_label_pairs; if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("length_value_label_table=" + length_value_label_table); } byte[] valueLabelTable_i = new byte[length_value_label_table]; int noBytes = stream.read(valueLabelTable_i, 0, length_value_label_table); if (noBytes == 0) { throw new 
IOException("reading value label table: no datum"); } // 2-1. 2-byte-integer array (2*m): value array (sorted) short[] valueList = new short[no_value_label_pairs]; int offset_value = 0; for (int k = 0; k < no_value_label_pairs; k++) { ByteBuffer bb_value_list = ByteBuffer.wrap(valueLabelTable_i, offset_value, value_label_table_length); if (isLittleEndian) { bb_value_list.order(ByteOrder.LITTLE_ENDIAN); } valueList[k] = bb_value_list.getShort(); offset_value += value_label_table_length; } if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("value_list=" + Arrays.toString(valueList) + "\n"); } // 2-2. 8-byte chars that store label data (m units of labels) if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("current offset_value=" + offset_value); } int offset_start = offset_value; int offset_end = offset_value + length_lable_name_field; String[] labelList = new String[no_value_label_pairs]; for (int l = 0; l < no_value_label_pairs; l++) { String string_l = new String(Arrays.copyOfRange(valueLabelTable_i, offset_start, offset_end), "ISO-8859-1"); int null_position = string_l.indexOf(0); if (null_position != -1) { labelList[l] = string_l.substring(0, null_position); } else { labelList[l] = string_l; } offset_start = offset_end; offset_end += length_lable_name_field; } // Finally, we've reached the actual value-label pairs. We'll go // through them and put them on the temporary lookup map: tempValueLabelTable.put(labelName, new LinkedHashMap<>()); for (int j = 0; j < no_value_label_pairs; j++) { if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine(j + "-th pair:" + valueList[j] + "[" + labelList[j] + "]"); } // TODO: do we need any null/empty string checks here? -- L.A. 
4.0 tempValueLabelTable.get(labelName).put(Integer.toString(valueList[j]), labelList[j]); } if (stream.available() == 0) { // reached the end of the file if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("reached the end of file at " + i + "th value-label Table."); } break; } } // for nvar loop // And now we can go through the list of variables, see if any have // value-label groups linked, then build dataverse VariableCategory // objects for them, using the values stored in the temporary map // we've just built: for (int i = 0; i < nvar; i++) { if (valueLabelsLookupTable[i] != null) { if (tempValueLabelTable.get(valueLabelsLookupTable[i]) != null) { // What if it is null? -- is it a legit condition, that // a variable was advertised as having categorical values, // but no such cat value group exists under this name? // -- L.A. for (String value : tempValueLabelTable.get(valueLabelsLookupTable[i]).keySet()) { VariableCategory cat = new VariableCategory(); cat.setValue(value); cat.setLabel(tempValueLabelTable.get(valueLabelsLookupTable[i]).get(value)); /* cross-link the variable and category to each other: */ cat.setDataVariable(dataTable.getDataVariables().get(i)); dataTable.getDataVariables().get(i).getCategories().add(cat); } } } } dbgLog.fine("parseValueLabelsRelease105(): end"); }
From source file:edu.harvard.iq.dvn.ingest.statdataio.impl.plugins.dta.DTAFileReader.java
private void parseValueLabelsReleasel108(BufferedInputStream stream) throws IOException { dbgLog.fine("***** parseValueLabelsRelease108(): start *****"); if (stream == null) { throw new IllegalArgumentException("stream == null!"); }/*from ww w . j a v a2s. co m*/ int nvar = (Integer) smd.getFileInformation().get("varQnty"); int length_label_name = constantTable.get("NAME"); int length_value_label_header = value_label_table_length + length_label_name + VALUE_LABEL_HEADER_PADDING_LENGTH; if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("value_label_table_length=" + value_label_table_length); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length_value_label_header=" + length_value_label_header); /* Seg field byte type 1-1. len_vlt(Seg.2) 4 int 1-2. vlt_name 9/33 char+(\0) == name used in Sec2.part 5 1-3. padding 3 byte ----------------------------------- 16/40 2-1. n(# of vls) 4 int 2-2. m(len_labels) 4 int 2-3. label_offsets 4*n int[] 2-4. values 4*n int[] 2-5. labels m char */ for (int i = 0; i < nvar; i++) { if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("\n\n" + i + "th value-label table header"); byte[] valueLabelHeader = new byte[length_value_label_header]; // Part 1: reading the header of a value-label table if exists int nbytes = stream.read(valueLabelHeader, 0, length_value_label_header); if (nbytes == 0) { throw new IOException("reading value label header: no datum"); } // 1.1 length_value_label_table ByteBuffer bb_value_label_header = ByteBuffer.wrap(valueLabelHeader, 0, value_label_table_length); if (isLittleEndian) { bb_value_label_header.order(ByteOrder.LITTLE_ENDIAN); } int length_value_label_table = bb_value_label_header.getInt(); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("length of this value-label table=" + length_value_label_table); // 1.2 labelName String rawLabelName = new String(Arrays.copyOfRange(valueLabelHeader, value_label_table_length, (value_label_table_length + length_label_name)), "ISO-8859-1"); String labelName = 
getNullStrippedString(rawLabelName); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("label name = " + labelName + "\n"); if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(i + "-th value-label table"); // Part 2: reading the value-label table byte[] valueLabelTable_i = new byte[length_value_label_table]; int noBytes = stream.read(valueLabelTable_i, 0, length_value_label_table); if (noBytes == 0) { throw new IOException("reading value label table: no datum"); } // 2-1. 4-byte-integer: number of units in this table (n) int valueLabelTable_offset = 0; ByteBuffer bb_value_label_pairs = ByteBuffer.wrap(valueLabelTable_i, valueLabelTable_offset, value_label_table_length); if (isLittleEndian) { bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN); } int no_value_label_pairs = bb_value_label_pairs.getInt(); valueLabelTable_offset += value_label_table_length; if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs); // 2-2. 4-byte-integer: length of the label section (m bytes) ByteBuffer bb_length_label_segment = ByteBuffer.wrap(valueLabelTable_i, valueLabelTable_offset, value_label_table_length); if (isLittleEndian) { bb_length_label_segment.order(ByteOrder.LITTLE_ENDIAN); } int length_label_segment = bb_length_label_segment.getInt(); valueLabelTable_offset += value_label_table_length; // 2-3. 4-byte-integer array (4xm): offset values for the label sec. // these "label offsets" actually appear to represent the byte // offsets of the label strings, as stored in the next section. // as of now, these are not used for anything, and the code // below assumes that the labels are already in the same // order as the numeric values! -- L.A. 
int[] label_offsets = new int[no_value_label_pairs]; int byte_offset = valueLabelTable_offset; for (int j = 0; j < no_value_label_pairs; j++) { // note: 4-byte singed, not java's long ByteBuffer bb_label_offset = ByteBuffer.wrap(valueLabelTable_i, byte_offset, value_label_table_length); if (isLittleEndian) { bb_label_offset.order(ByteOrder.LITTLE_ENDIAN); dbgLog.fine("label offset: byte reversed"); } label_offsets[j] = bb_label_offset.getInt(); dbgLog.fine("label offset [" + j + "]: " + label_offsets[j]); byte_offset += value_label_table_length; } // 2-4. 4-byte-integer array (4xm): value array (sorted) dbgLog.fine("value array"); int[] valueList = new int[no_value_label_pairs]; int offset_value = byte_offset; for (int k = 0; k < no_value_label_pairs; k++) { ByteBuffer bb_value_list = ByteBuffer.wrap(valueLabelTable_i, offset_value, value_label_table_length); if (isLittleEndian) { bb_value_list.order(ByteOrder.LITTLE_ENDIAN); } valueList[k] = bb_value_list.getInt(); offset_value += value_label_table_length; } // 2-5. m-byte chars that store label data (m units of labels) String label_segment = new String( Arrays.copyOfRange(valueLabelTable_i, offset_value, (length_label_segment + offset_value)), "ISO-8859-1"); // L.A. -- 2011.2.25: // This assumes that the labels are already stored in the right // order: (see my comment for the section 2.3 above) //String[] labelList = label_segment.split("\0"); // Instead, we should be using the offset values obtained in // the section 2.3 above, and select the corresponding // substrings: String[] labelList = new String[no_value_label_pairs]; for (int l = 0; l < no_value_label_pairs; l++) { String lblString = null; int lblOffset = label_offsets[l]; lblString = label_segment.substring(lblOffset); int nullIndx = lblString.indexOf('\000'); if (nullIndx > -1) { lblString = lblString.substring(0, nullIndx); } labelList[l] = lblString; } // this should work! -- L.A. 
// (TODO: check the v105 value label parsing method, to see if // something similar applies there) Map<String, String> tmpValueLabelTable = new LinkedHashMap<String, String>(); for (int l = 0; l < no_value_label_pairs; l++) { if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine(l + "-th pair:" + valueList[l] + "[" + labelList[l] + "]"); tmpValueLabelTable.put(Integer.toString(valueList[l]), labelList[l]); } valueLabelTable.put(labelName, tmpValueLabelTable); if (stream.available() == 0) { // reached the end of this file // do exit-processing dbgLog.fine("***** reached the end of the file at " + i + "th value-label Table *****"); break; } } // for loop if (dbgLog.isLoggable(Level.FINE)) dbgLog.fine("valueLabelTable:\n" + valueLabelTable); smd.setValueLabelTable(valueLabelTable); dbgLog.fine("***** parseValueLabelsRelease108(): end *****"); }
From source file:edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.dta.DTAFileReader.java
private void parseValueLabelsReleasel108(BufferedInputStream stream) throws IOException { dbgLog.fine("parseValueLabelsRelease108(): start"); if (stream == null) { throw new IllegalArgumentException("stream == null!"); }/* w ww . j av a 2s.c om*/ int nvar = dataTable.getVarQuantity().intValue(); int length_label_name = constantTable.get("NAME"); int length_value_label_header = value_label_table_length + length_label_name + VALUE_LABEL_HEADER_PADDING_LENGTH; if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("value_label_table_length=" + value_label_table_length); } if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("length_value_label_header=" + length_value_label_header); } /* Seg field byte type 1-1. len_vlt(Seg.2) 4 int 1-2. vlt_name 9/33 char+(\0) == name used in Sec2.part 5 1-3. padding 3 byte ----------------------------------- 16/40 2-1. n(# of vls) 4 int 2-2. m(len_labels) 4 int 2-3. label_offsets 4*n int[] 2-4. values 4*n int[] 2-5. labels m char */ // This map will hold a temporary lookup table for all the categorical // value-label groups: // These groups have unique names, and a group *may be shared* between // multiple variables. In the method decodeDescriptorValueLabel above // we have populated a lookup table where variables are linked to the // corresponding value-label groups by name. Thus we must fully populate // the full map of all the variable group, then go through the list // of variables and create the dataverse variable categories from // them. -- L.A. 
4.0 Map<String, Map<String, String>> tempValueLabelTable = new LinkedHashMap<>(); for (int i = 0; i < nvar; i++) { if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("\n\n" + i + "th value-label table header"); } byte[] valueLabelHeader = new byte[length_value_label_header]; // Part 1: reading the header of a value-label table if exists int nbytes = stream.read(valueLabelHeader, 0, length_value_label_header); if (nbytes == 0) { throw new IOException("reading value label header: no datum"); } // 1.1 length_value_label_table ByteBuffer bb_value_label_header = ByteBuffer.wrap(valueLabelHeader, 0, value_label_table_length); if (isLittleEndian) { bb_value_label_header.order(ByteOrder.LITTLE_ENDIAN); } int length_value_label_table = bb_value_label_header.getInt(); if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("length of this value-label table=" + length_value_label_table); } // 1.2 labelName String rawLabelName = new String(Arrays.copyOfRange(valueLabelHeader, value_label_table_length, (value_label_table_length + length_label_name)), "ISO-8859-1"); String labelName = getNullStrippedString(rawLabelName); if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("label name = " + labelName + "\n"); } if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine(i + "-th value-label table"); } // Part 2: reading the value-label table byte[] valueLabelTable_i = new byte[length_value_label_table]; int noBytes = stream.read(valueLabelTable_i, 0, length_value_label_table); if (noBytes == 0) { throw new IOException("reading value label table: no datum"); } // 2-1. 
4-byte-integer: number of units in this table (n) int valueLabelTable_offset = 0; ByteBuffer bb_value_label_pairs = ByteBuffer.wrap(valueLabelTable_i, valueLabelTable_offset, value_label_table_length); if (isLittleEndian) { bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN); } int no_value_label_pairs = bb_value_label_pairs.getInt(); valueLabelTable_offset += value_label_table_length; if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs); } // 2-2. 4-byte-integer: length of the label section (m bytes) ByteBuffer bb_length_label_segment = ByteBuffer.wrap(valueLabelTable_i, valueLabelTable_offset, value_label_table_length); if (isLittleEndian) { bb_length_label_segment.order(ByteOrder.LITTLE_ENDIAN); } int length_label_segment = bb_length_label_segment.getInt(); valueLabelTable_offset += value_label_table_length; // 2-3. 4-byte-integer array (4xm): offset values for the label sec. // these "label offsets" actually appear to represent the byte // offsets of the label strings, as stored in the next section. // as of now, these are not used for anything, and the code // below assumes that the labels are already in the same // order as the numeric values! -- L.A. int[] label_offsets = new int[no_value_label_pairs]; int byte_offset = valueLabelTable_offset; for (int j = 0; j < no_value_label_pairs; j++) { // note: 4-byte singed, not java's long ByteBuffer bb_label_offset = ByteBuffer.wrap(valueLabelTable_i, byte_offset, value_label_table_length); if (isLittleEndian) { bb_label_offset.order(ByteOrder.LITTLE_ENDIAN); dbgLog.fine("label offset: byte reversed"); } label_offsets[j] = bb_label_offset.getInt(); dbgLog.fine("label offset [" + j + "]: " + label_offsets[j]); byte_offset += value_label_table_length; } // 2-4. 
4-byte-integer array (4xm): value array (sorted) dbgLog.fine("value array"); int[] valueList = new int[no_value_label_pairs]; int offset_value = byte_offset; for (int k = 0; k < no_value_label_pairs; k++) { ByteBuffer bb_value_list = ByteBuffer.wrap(valueLabelTable_i, offset_value, value_label_table_length); if (isLittleEndian) { bb_value_list.order(ByteOrder.LITTLE_ENDIAN); } valueList[k] = bb_value_list.getInt(); offset_value += value_label_table_length; } // 2-5. m-byte chars that store label data (m units of labels) String label_segment = new String( Arrays.copyOfRange(valueLabelTable_i, offset_value, (length_label_segment + offset_value)), "ISO-8859-1"); // L.A. -- 2011.2.25: // This assumes that the labels are already stored in the right // order: (see my comment for the section 2.3 above) //String[] labelList = label_segment.split("\0"); // Instead, we should be using the offset values obtained in // the section 2.3 above, and select the corresponding // substrings: String[] labelList = new String[no_value_label_pairs]; for (int l = 0; l < no_value_label_pairs; l++) { String lblString = null; int lblOffset = label_offsets[l]; lblString = label_segment.substring(lblOffset); int nullIndx = lblString.indexOf('\000'); if (nullIndx > -1) { lblString = lblString.substring(0, nullIndx); } labelList[l] = lblString; } // this should work! -- L.A. // (TODO: check the v105 value label parsing method, to see if // something similar applies there) // Finally, we've reached the actual value-label pairs. We'll go // through them and put them on the temporary lookup map: tempValueLabelTable.put(labelName, new LinkedHashMap<>()); for (int l = 0; l < no_value_label_pairs; l++) { if (dbgLog.isLoggable(Level.FINE)) { dbgLog.fine(l + "-th pair:" + valueList[l] + "[" + labelList[l] + "]"); } // TODO: do we need any null/empty string checks here? -- L.A. 
4.0 tempValueLabelTable.get(labelName).put(Integer.toString(valueList[l]), labelList[l]); } if (stream.available() == 0) { // reached the end of the file dbgLog.fine("reached the end of the file at " + i + "th value-label Table"); break; } } // for nvar loop // And now we can go through the list of variables, see if any have // value-label groups linked, then build dataverse VariableCategory // objects for them, using the values stored in the temporary map // we've just built: // TODO: this code is duplicated between this, and the "105 version" of // this method, above. Maybe it should be isolated in its own method. // -- L.A. 4.0 for (int i = 0; i < nvar; i++) { if (valueLabelsLookupTable[i] != null) { if (tempValueLabelTable.get(valueLabelsLookupTable[i]) != null) { // What if it is null? -- is it a legit condition, that // a variable was advertised as having categorical values, // but no such cat value group exists under this name? // -- L.A. for (String value : tempValueLabelTable.get(valueLabelsLookupTable[i]).keySet()) { VariableCategory cat = new VariableCategory(); cat.setValue(value); cat.setLabel(tempValueLabelTable.get(valueLabelsLookupTable[i]).get(value)); /* cross-link the variable and category to each other: */ cat.setDataVariable(dataTable.getDataVariables().get(i)); dataTable.getDataVariables().get(i).getCategories().add(cat); } } } } dbgLog.fine("parseValueLabelsRelease108(): end"); }
From source file:edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SAVFileReader.java
void decodeRecordTypeDataUnCompressed(BufferedInputStream stream) throws IOException { dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): start *****"); if (stream == null) { throw new IllegalArgumentException("decodeRecordTypeDataUnCompressed: stream == null!"); }// w w w.j a v a 2 s . c o m int varQnty = dataTable.getVarQuantity().intValue(); // // set-up tab file PrintWriter pwout = createOutputWriter(stream); boolean hasStringVarContinuousBlock = obsNonVariableBlockSet.size() > 0 ? true : false; dbgLog.fine("hasStringVarContinuousBlock=" + hasStringVarContinuousBlock); int ii = 0; int OBS = LENGTH_SAV_OBS_BLOCK; int nOBS = OBSUnitsPerCase; dbgLog.fine("OBSUnitsPerCase=" + OBSUnitsPerCase); int caseIndex = 0; dbgLog.fine("printFormatTable:\n" + printFormatTable); variableFormatTypeList = new String[varQnty]; dateFormatList = new String[varQnty]; for (int i = 0; i < varQnty; i++) { variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE .get(printFormatTable.get(variableNameList.get(i))); dbgLog.fine("i=" + i + "th variableFormatTypeList=" + variableFormatTypeList[i]); formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); } dbgLog.fine("variableFormatType:\n" + Arrays.deepToString(variableFormatTypeList)); dbgLog.fine("formatCategoryTable:\n" + formatCategoryTable); int numberOfDecimalVariables = 0; // TODO: // Make sure the date formats are actually preserved! // (this is something that was collected in the code below and passed // to the UNF calculator). // -- L.A. 
4.0 alpha List<String> casewiseRecordForTabFile = new ArrayList<String>(); // missing values are written to the tab-delimited file by // using the default or user-specified missing-value strings; // however, to calculate UNF/summary statistics, // classes for these calculations require their specific // missing values that differ from the above missing-value // strings; therefore, after row data for the tab-delimited // file are written, missing values in a row are changed to // UNF/summary-statistics-OK ones. // data-storage object for sumStat ///dataTable2 = new Object[varQnty][caseQnty]; // storage of date formats to pass to UNF ///dateFormats = new String[varQnty][caseQnty]; try { for (int i = 0;; i++) { // case-wise loop byte[] buffer = new byte[OBS * nOBS]; int nbytesuc = stream.read(buffer); StringBuilder sb_stringStorage = new StringBuilder(""); for (int k = 0; k < nOBS; k++) { int offset = OBS * k; // uncompressed case // numeric missing value == sysmis // FF FF FF FF FF FF eF FF(little endian) // string missing value // 20 20 20 20 20 20 20 20 // cf: compressed case // numeric type:sysmis == 0xFF // string type: missing value == 0xFE // boolean isNumeric = OBSwiseTypelList.get(k) == 0 ? 
true : false; if (isNumeric) { dbgLog.finer(k + "-th variable is numeric"); // interprete as double ByteBuffer bb_double = ByteBuffer.wrap(buffer, offset, LENGTH_SAV_OBS_BLOCK); if (isLittleEndian) { bb_double.order(ByteOrder.LITTLE_ENDIAN); } //char[] hexpattern = String dphex = new String(Hex.encodeHex( Arrays.copyOfRange(bb_double.array(), offset, offset + LENGTH_SAV_OBS_BLOCK))); dbgLog.finer("dphex=" + dphex); if ((dphex.equals("ffffffffffffefff")) || (dphex.equals("ffefffffffffffff"))) { //casewiseRecordForTabFile.add(systemMissingValue); // add the numeric missing value dbgLog.fine("SAV Reader: adding: Missing Value (numeric)"); casewiseRecordForTabFile.add(MissingValueForTextDataFileNumeric); } else { Double ddatum = bb_double.getDouble(); dbgLog.fine("SAV Reader: adding: ddatum=" + ddatum); // add this non-missing-value numeric datum casewiseRecordForTabFile.add(doubleNumberFormatter.format(ddatum)); } } else { dbgLog.finer(k + "-th variable is string"); // string case // strip space-padding // do not trim: string might have spaces within it // the missing value (hex) for a string variable is: // "20 20 20 20 20 20 20 20" String strdatum = new String( Arrays.copyOfRange(buffer, offset, (offset + LENGTH_SAV_OBS_BLOCK)), defaultCharSet); dbgLog.finer("str_datum=" + strdatum); // add this non-missing-value string datum casewiseRecordForTabFile.add(strdatum); } // if isNumeric } // k-loop // String-variable's continuous block exits: if (hasStringVarContinuousBlock) { // continuous blocks: string case // concatenating process //dbgLog.fine("concatenating process starts"); //dbgLog.fine("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); //dbgLog.fine("casewiseRecordForTabFile(before:size)="+casewiseRecordForTabFile.size()); StringBuilder sb = new StringBuilder(""); int firstPosition = 0; Set<Integer> removeJset = new HashSet<Integer>(); for (int j = 0; j < nOBS; j++) { dbgLog.finer("j=" + j + "-th type =" + OBSwiseTypelList.get(j)); if 
(OBSwiseTypelList.get(j) == -1) { // String continued fount at j-th // look back the j-1 firstPosition = j - 1; int lastJ = j; String concatanated = null; removeJset.add(j); sb.append(casewiseRecordForTabFile.get(j - 1)); sb.append(casewiseRecordForTabFile.get(j)); for (int jc = 1;; jc++) { if (OBSwiseTypelList.get(j + jc) != -1) { // j is the end unit of this string variable concatanated = sb.toString(); sb.setLength(0); lastJ = j + jc; break; } else { sb.append(casewiseRecordForTabFile.get(j + jc)); removeJset.add(j + jc); } } casewiseRecordForTabFile.set(j - 1, concatanated); //out.println(j-1+"th concatanated="+concatanated); j = lastJ - 1; } // end-of-if: continuous-OBS only } // end of loop-j List<String> newDataLine = new ArrayList<String>(); for (int jl = 0; jl < casewiseRecordForTabFile.size(); jl++) { //out.println("jl="+jl+"-th datum =["+casewiseRecordForTabFile.get(jl)+"]"); if (!removeJset.contains(jl)) { newDataLine.add(casewiseRecordForTabFile.get(jl)); } } dbgLog.fine("new casewiseRecordForTabFile=" + newDataLine); dbgLog.fine("new casewiseRecordForTabFile(size)=" + newDataLine.size()); casewiseRecordForTabFile = newDataLine; } // end-if: stringContinuousVar-exist case caseIndex++; dbgLog.finer("caseIndex=" + caseIndex); for (int k = 0; k < casewiseRecordForTabFile.size(); k++) { if (variableTypelList.get(k) > 0) { // See my comments for this padding removal logic // in the "compressed" method -- L.A. String paddRemoved = StringUtils.stripEnd(casewiseRecordForTabFile.get(k).toString(), null); // TODO: clean this up. For now, just make sure that strings contain at least one blank space. 
if (paddRemoved.equals("")) { paddRemoved = " "; } //casewiseRecordForTabFile.set(k, "\"" + paddRemoved.replaceAll("\"", Matcher.quoteReplacement("\\\"")) + "\""); casewiseRecordForTabFile.set(k, escapeCharacterString(paddRemoved)); // end of String var case } // end of variable-type check if (casewiseRecordForTabFile.get(k) != null && !casewiseRecordForTabFile.get(k).equals(MissingValueForTextDataFileNumeric)) { // to do date conversion String variableFormatType = variableFormatTypeList[k]; dbgLog.finer("k=" + k + "th variable format=" + variableFormatType); int formatDecimalPointPosition = formatDecimalPointPositionList.get(k); if (variableFormatType.equals("date")) { dbgLog.finer("date case"); long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymd.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); dateFormatList[k] = sdf_ymd.toPattern(); } else if (variableFormatType.equals("time")) { dbgLog.finer("time case:DTIME or DATETIME or TIME"); //formatCategoryTable.put(variableNameList.get(k), "time"); // not treating DTIME as date/time; see comment elsewhere in // the code; // (but we do need to remember to treat the resulting values // as character strings, not numerics!) 
if (printFormatTable.get(variableNameList.get(k)).equals("DTIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_BIAS; String newDatum = sdf_dhms.format(new Date(dateDatum)); // Note: DTIME is not a complete date, so we don't save a date format with it dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_BIAS; StringBuilder sb_time = new StringBuilder(sdf_dhms.format(new Date(dateDatum))); if (formatDecimalPointPosition > 0) { sb_time.append("." + timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); } } else if (printFormatTable.get(variableNameList.get(k)).equals("DATETIME")) { // TODO: // (for both datetime and "dateless" time) // keep the longest of the matching formats - i.e., if there are *some* // values in the vector that have thousands of a second, that should be // part of the saved format! // -- L.A. Aug. 
12 2014 if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymdhms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); dateFormatList[k] = sdf_ymdhms.toPattern(); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_OFFSET; StringBuilder sb_time = new StringBuilder( sdf_ymdhms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append("." + timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); // datetime with milliseconds: dateFormatList[k] = sdf_ymdhms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : ""); } } else if (printFormatTable.get(variableNameList.get(k)).equals("TIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L; String newDatum = sdf_hms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); if (dateFormatList[k] == null) { dateFormatList[k] = sdf_hms.toPattern(); } } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L; StringBuilder sb_time = new StringBuilder(sdf_hms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append("." 
+ timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); // time, possibly with milliseconds: String format_hmsS = sdf_hms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : ""); if (dateFormatList[k] == null || (format_hmsS.length() > dateFormatList[k].length())) { dateFormatList[k] = format_hmsS; } } } } else if (variableFormatType.equals("other")) { dbgLog.finer("other non-date/time case"); if (printFormatTable.get(variableNameList.get(k)).equals("WKDAY")) { // day of week dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.WEEKDAY_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.WEEKDAY_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); dbgLog.finer("wkday:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } else if (printFormatTable.get(variableNameList.get(k)).equals("MONTH")) { // month dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.MONTH_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.MONTH_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); dbgLog.finer("month:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } } // end of date/time block } // end: date-time-datum check } // end: loop-k(2nd: variablte-wise-check) // write to tab file if (casewiseRecordForTabFile.size() > 0) { pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); } // numeric contents-check for (int l = 0; l < casewiseRecordForTabFile.size(); l++) { if (variableFormatTypeList[l].equals("date") || variableFormatTypeList[l].equals("time") || 
printFormatTable.get(variableNameList.get(l)).equals("WKDAY") || printFormatTable.get(variableNameList.get(l)).equals("MONTH")) { } else { if (variableTypelList.get(l) <= 0) { if (casewiseRecordForTabFile.get(l).toString().indexOf(".") >= 0) { decimalVariableSet.add(l); } } } } // reset the case-wise working objects casewiseRecordForTabFile.clear(); if (stream.available() == 0) { // reached the end of this file // do exit-processing dbgLog.fine("reached the end of the file at " + ii + "th iteration"); break; } // if eof processing } //i-loop: case(row) iteration // close the writer pwout.close(); } catch (IOException ex) { throw ex; } // contents check dbgLog.fine("numberOfDecimalVariables=" + numberOfDecimalVariables); dbgLog.fine("decimalVariableSet=" + decimalVariableSet); dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): end *****"); }
From source file:edu.harvard.iq.dvn.ingest.statdataio.impl.plugins.sav.SAVFileReader.java
void decodeRecordTypeDataUnCompressed(BufferedInputStream stream) throws IOException { dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): start *****"); if (stream == null) { throw new IllegalArgumentException("decodeRecordTypeDataUnCompressed: stream == null!"); }/*from w ww . j a v a 2 s.co m*/ Map<String, String> formatCategoryTable = new LinkedHashMap<String, String>(); // // set-up tab file PrintWriter pwout = createOutputWriter(stream); boolean hasStringVarContinuousBlock = obsNonVariableBlockSet.size() > 0 ? true : false; dbgLog.fine("hasStringVarContinuousBlock=" + hasStringVarContinuousBlock); int ii = 0; int OBS = LENGTH_SAV_OBS_BLOCK; int nOBS = OBSUnitsPerCase; dbgLog.fine("OBSUnitsPerCase=" + OBSUnitsPerCase); int caseIndex = 0; dbgLog.fine("printFormatTable:\n" + printFormatTable); dbgLog.fine("printFormatNameTable:\n" + printFormatNameTable); variableFormatTypeList = new String[varQnty]; for (int i = 0; i < varQnty; i++) { variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE .get(printFormatTable.get(variableNameList.get(i))); dbgLog.fine("i=" + i + "th variableFormatTypeList=" + variableFormatTypeList[i]); formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); } dbgLog.fine("variableFormatType:\n" + Arrays.deepToString(variableFormatTypeList)); dbgLog.fine("formatCategoryTable:\n" + formatCategoryTable); // contents (variable) checker concering decimals variableTypeFinal = new int[varQnty]; Arrays.fill(variableTypeFinal, 0); int numberOfDecimalVariables = 0; List<String> casewiseRecordForTabFile = new ArrayList<String>(); String[] caseWiseDateFormatForUNF = null; List<String> casewiseRecordForUNF = new ArrayList<String>(); // missing values are written to the tab-delimited file by // using the default or user-specified missing-value strings; // however, to calculate UNF/summary statistics, // classes for these calculations require their specific // missing values that differ from the above missing-value // 
strings; therefore, after row data for the tab-delimited // file are written, missing values in a row are changed to // UNF/summary-statistics-OK ones. // data-storage object for sumStat dataTable2 = new Object[varQnty][caseQnty]; // storage of date formats to pass to UNF dateFormats = new String[varQnty][caseQnty]; try { for (int i = 0;; i++) { // case-wise loop byte[] buffer = new byte[OBS * nOBS]; int nbytesuc = stream.read(buffer); StringBuilder sb_stringStorage = new StringBuilder(""); for (int k = 0; k < nOBS; k++) { int offset = OBS * k; // uncompressed case // numeric missing value == sysmis // FF FF FF FF FF FF eF FF(little endian) // string missing value // 20 20 20 20 20 20 20 20 // cf: compressed case // numeric type:sysmis == 0xFF // string type: missing value == 0xFE // boolean isNumeric = OBSwiseTypelList.get(k) == 0 ? true : false; if (isNumeric) { dbgLog.finer(k + "-th variable is numeric"); // interprete as double ByteBuffer bb_double = ByteBuffer.wrap(buffer, offset, LENGTH_SAV_OBS_BLOCK); if (isLittleEndian) { bb_double.order(ByteOrder.LITTLE_ENDIAN); } //char[] hexpattern = String dphex = new String(Hex.encodeHex( Arrays.copyOfRange(bb_double.array(), offset, offset + LENGTH_SAV_OBS_BLOCK))); dbgLog.finer("dphex=" + dphex); if ((dphex.equals("ffffffffffffefff")) || (dphex.equals("ffefffffffffffff"))) { //casewiseRecordForTabFile.add(systemMissingValue); // add the numeric missing value dbgLog.fine("SAV Reader: adding: Missing Value (numeric)"); casewiseRecordForTabFile.add(MissingValueForTextDataFileNumeric); } else { Double ddatum = bb_double.getDouble(); dbgLog.fine("SAV Reader: adding: ddatum=" + ddatum); // add this non-missing-value numeric datum casewiseRecordForTabFile.add(doubleNumberFormatter.format(ddatum)); } } else { dbgLog.finer(k + "-th variable is string"); // string case // strip space-padding // do not trim: string might have spaces within it // the missing value (hex) for a string variable is: // "20 20 20 20 20 20 20 20" 
String strdatum = new String( Arrays.copyOfRange(buffer, offset, (offset + LENGTH_SAV_OBS_BLOCK)), defaultCharSet); dbgLog.finer("str_datum=" + strdatum); // add this non-missing-value string datum casewiseRecordForTabFile.add(strdatum); } // if isNumeric } // k-loop // String-variable's continuous block exits: if (hasStringVarContinuousBlock) { // continuous blocks: string case // concatenating process //dbgLog.fine("concatenating process starts"); //dbgLog.fine("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); //dbgLog.fine("casewiseRecordForTabFile(before:size)="+casewiseRecordForTabFile.size()); StringBuilder sb = new StringBuilder(""); int firstPosition = 0; Set<Integer> removeJset = new HashSet<Integer>(); for (int j = 0; j < nOBS; j++) { dbgLog.finer("j=" + j + "-th type =" + OBSwiseTypelList.get(j)); if (OBSwiseTypelList.get(j) == -1) { // String continued fount at j-th // look back the j-1 firstPosition = j - 1; int lastJ = j; String concatanated = null; removeJset.add(j); sb.append(casewiseRecordForTabFile.get(j - 1)); sb.append(casewiseRecordForTabFile.get(j)); for (int jc = 1;; jc++) { if (OBSwiseTypelList.get(j + jc) != -1) { // j is the end unit of this string variable concatanated = sb.toString(); sb.setLength(0); lastJ = j + jc; break; } else { sb.append(casewiseRecordForTabFile.get(j + jc)); removeJset.add(j + jc); } } casewiseRecordForTabFile.set(j - 1, concatanated); //out.println(j-1+"th concatanated="+concatanated); j = lastJ - 1; } // end-of-if: continuous-OBS only } // end of loop-j List<String> newDataLine = new ArrayList<String>(); for (int jl = 0; jl < casewiseRecordForTabFile.size(); jl++) { //out.println("jl="+jl+"-th datum =["+casewiseRecordForTabFile.get(jl)+"]"); if (!removeJset.contains(jl)) { newDataLine.add(casewiseRecordForTabFile.get(jl)); } } dbgLog.fine("new casewiseRecordForTabFile=" + newDataLine); dbgLog.fine("new casewiseRecordForTabFile(size)=" + newDataLine.size()); casewiseRecordForTabFile = newDataLine; } 
// end-if: stringContinuousVar-exist case for (int el = 0; el < casewiseRecordForTabFile.size(); el++) { casewiseRecordForUNF.add(casewiseRecordForTabFile.get(el)); } caseWiseDateFormatForUNF = new String[casewiseRecordForTabFile.size()]; caseIndex++; dbgLog.finer("caseIndex=" + caseIndex); for (int k = 0; k < casewiseRecordForTabFile.size(); k++) { if (variableTypelList.get(k) > 0) { // String variable case: set to -1 variableTypeFinal[k] = -1; // See my comments for this padding removal logic // in the "compressed" method -- L.A. String paddRemoved = StringUtils.stripEnd(casewiseRecordForTabFile.get(k).toString(), null); // TODO: clean this up. For now, just make sure that strings contain at least one blank space. if (paddRemoved.equals("")) { paddRemoved = " "; } casewiseRecordForUNF.set(k, paddRemoved); casewiseRecordForTabFile.set(k, "\"" + paddRemoved.replaceAll("\"", Matcher.quoteReplacement("\\\"")) + "\""); // end of String var case } else { // numeric var case if (casewiseRecordForTabFile.get(k).equals(MissingValueForTextDataFileNumeric)) { casewiseRecordForUNF.set(k, null); } } // end of variable-type check if (casewiseRecordForTabFile.get(k) != null && !casewiseRecordForTabFile.get(k).equals(MissingValueForTextDataFileNumeric)) { // to do date conversion String variableFormatType = variableFormatTypeList[k]; dbgLog.finer("k=" + k + "th variable format=" + variableFormatType); int formatDecimalPointPosition = formatDecimalPointPositionList.get(k); if (variableFormatType.equals("date")) { dbgLog.finer("date case"); long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymd.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); caseWiseDateFormatForUNF[k] = sdf_ymd.toPattern(); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); //formatCategoryTable.put(variableNameList.get(k), "date"); } else if (variableFormatType.equals("time")) { 
dbgLog.finer("time case:DTIME or DATETIME or TIME"); //formatCategoryTable.put(variableNameList.get(k), "time"); if (printFormatTable.get(variableNameList.get(k)).equals("DTIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_BIAS; String newDatum = sdf_dhms.format(new Date(dateDatum)); // Note: DTIME is not a complete date, so we don't save a date format with it dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_BIAS; StringBuilder sb_time = new StringBuilder(sdf_dhms.format(new Date(dateDatum))); if (formatDecimalPointPosition > 0) { sb_time.append("." + timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); casewiseRecordForUNF.set(k, sb_time.toString()); } } else if (printFormatTable.get(variableNameList.get(k)).equals("DATETIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymdhms.format(new Date(dateDatum)); caseWiseDateFormatForUNF[k] = sdf_ymdhms.toPattern(); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_OFFSET; StringBuilder sb_time = new StringBuilder( sdf_ymdhms.format(new Date(dateDatum))); 
//dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append("." + timeData[1].substring(0, formatDecimalPointPosition)); } caseWiseDateFormatForUNF[k] = sdf_ymdhms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : ""); dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); casewiseRecordForUNF.set(k, sb_time.toString()); } } else if (printFormatTable.get(variableNameList.get(k)).equals("TIME")) { if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L; String newDatum = sdf_hms.format(new Date(dateDatum)); caseWiseDateFormatForUNF[k] = sdf_hms.toPattern(); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString().split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L; StringBuilder sb_time = new StringBuilder(sdf_hms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append("." + timeData[1].substring(0, formatDecimalPointPosition)); } caseWiseDateFormatForUNF[k] = this.sdf_hms.toPattern() + (formatDecimalPointPosition > 0 ? 
".S" : ""); dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); casewiseRecordForUNF.set(k, sb_time.toString()); } } } else if (variableFormatType.equals("other")) { dbgLog.finer("other non-date/time case"); if (printFormatTable.get(variableNameList.get(k)).equals("WKDAY")) { // day of week dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.WEEKDAY_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.WEEKDAY_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); dbgLog.finer("wkday:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } else if (printFormatTable.get(variableNameList.get(k)).equals("MONTH")) { // month dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.MONTH_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.MONTH_LIST .get(Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); casewiseRecordForUNF.set(k, newDatum); dbgLog.finer("month:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } } // end of date/time block } // end: date-time-datum check } // end: loop-k(2nd: variablte-wise-check) // write to tab file if (casewiseRecordForTabFile.size() > 0) { pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); } if (casewiseRecordForTabFile.size() > 0) { for (int ij = 0; ij < varQnty; ij++) { dataTable2[ij][caseIndex - 1] = casewiseRecordForUNF.get(ij); if (variableFormatTypeList[ij].equals("date") || variableFormatTypeList[ij].equals("time")) { this.dateFormats[ij][caseIndex - 1] = caseWiseDateFormatForUNF[ij]; } } } // numeric contents-check for (int l = 0; l < casewiseRecordForTabFile.size(); l++) 
{ if (variableFormatTypeList[l].equals("date") || variableFormatTypeList[l].equals("time") || printFormatTable.get(variableNameList.get(l)).equals("WKDAY") || printFormatTable.get(variableNameList.get(l)).equals("MONTH")) { variableTypeFinal[l] = -1; } if (variableTypeFinal[l] == 0) { if (casewiseRecordForTabFile.get(l).toString().indexOf(".") >= 0) { // l-th variable is not integer variableTypeFinal[l] = 1; decimalVariableSet.add(l); } } } // reset the case-wise working objects casewiseRecordForTabFile.clear(); casewiseRecordForUNF.clear(); if (stream.available() == 0) { // reached the end of this file // do exit-processing dbgLog.fine("***** reached the end of the file at " + ii + "th iteration *****"); break; } // if eof processing } //i-loop: case(row) iteration // close the writer pwout.close(); } catch (IOException ex) { throw ex; } smd.getFileInformation().put("caseQnty", caseQnty); smd.setDecimalVariables(decimalVariableSet); smd.setVariableFormatCategory(formatCategoryTable); // contents check dbgLog.fine("variableType=" + ArrayUtils.toString(variableTypeFinal)); dbgLog.fine("numberOfDecimalVariables=" + numberOfDecimalVariables); dbgLog.fine("decimalVariableSet=" + decimalVariableSet); dbgLog.fine("***** decodeRecordTypeDataUnCompressed(): end *****"); }
From source file:edu.harvard.iq.dataverse.ingest.tabulardata.impl.plugins.sav.SAVFileReader.java
void decodeRecordTypeDataCompressed(BufferedInputStream stream) throws IOException { dbgLog.fine("***** decodeRecordTypeDataCompressed(): start *****"); if (stream == null) { throw new IllegalArgumentException("decodeRecordTypeDataCompressed: stream == null!"); }/*from w ww. ja v a2 s . co m*/ PrintWriter pwout = createOutputWriter(stream); int varQnty = dataTable.getVarQuantity().intValue(); int caseQnty = dataTable.getCaseQuantity().intValue(); dbgLog.fine("varQnty: " + varQnty); dateFormatList = new String[varQnty]; boolean hasStringVarContinuousBlock = obsNonVariableBlockSet.size() > 0 ? true : false; dbgLog.fine("hasStringVarContinuousBlock=" + hasStringVarContinuousBlock); int ii = 0; int OBS = LENGTH_SAV_OBS_BLOCK; int nOBS = OBSUnitsPerCase; dbgLog.fine("OBSUnitsPerCase=" + OBSUnitsPerCase); int caseIndex = 0; dbgLog.fine("printFormatTable:\n" + printFormatTable); variableFormatTypeList = new String[varQnty]; for (int i = 0; i < varQnty; i++) { variableFormatTypeList[i] = SPSSConstants.FORMAT_CATEGORY_TABLE .get(printFormatTable.get(variableNameList.get(i))); dbgLog.fine("i=" + i + "th variableFormatTypeList=" + variableFormatTypeList[i]); formatCategoryTable.put(variableNameList.get(i), variableFormatTypeList[i]); } dbgLog.fine("variableFormatType:\n" + Arrays.deepToString(variableFormatTypeList)); dbgLog.fine("formatCategoryTable:\n" + formatCategoryTable); // TODO: // Make sure the date formats are actually preserved! // (this is something that was collected in the code below and passed // to the UNF calculator). // -- L.A. 4.0 alpha List<String> casewiseRecordForTabFile = new ArrayList<String>(); try { // this compression is applied only to non-float data, i.e. 
integer; // 8-byte float datum is kept in tact boolean hasReachedEOF = false; OBSERVATION: while (true) { dbgLog.fine("SAV Reader: compressed: ii=" + ii + "-th iteration"); byte[] octate = new byte[LENGTH_SAV_OBS_BLOCK]; int nbytes = stream.read(octate); // processCompressedOBSblock () // (this means process a block of 8 compressed OBS // values -- should result in 64 bytes of data total) for (int i = 0; i < LENGTH_SAV_OBS_BLOCK; i++) { dbgLog.finer("i=" + i + "-th iteration"); int octate_i = octate[i]; //dbgLog.fine("octate="+octate_i); if (octate_i < 0) { octate_i += 256; } int byteCode = octate_i;//octate_i & 0xF; //out.println("byeCode="+byteCode); // processCompressedOBS switch (byteCode) { case 252: // end of the file dbgLog.fine("SAV Reader: compressed: end of file mark [FC] was found"); hasReachedEOF = true; break; case 253: // FD: uncompressed data follows after this octate // long string datum or float datum // read the following octate byte[] uncompressedByte = new byte[LENGTH_SAV_OBS_BLOCK]; int ucbytes = stream.read(uncompressedByte); int typeIndex = (ii * OBS + i) % nOBS; if ((OBSwiseTypelList.get(typeIndex) > 0) || (OBSwiseTypelList.get(typeIndex) == -1)) { // code= >0 |-1: string or its conitiguous block // decode as a string object String strdatum = new String( Arrays.copyOfRange(uncompressedByte, 0, LENGTH_SAV_OBS_BLOCK), defaultCharSet); //out.println("str_datum="+strdatum+"<-"); // add this non-missing-value string datum casewiseRecordForTabFile.add(strdatum); //out.println("casewiseRecordForTabFile(String)="+casewiseRecordForTabFile); } else if (OBSwiseTypelList.get(typeIndex) == -2) { String strdatum = new String( Arrays.copyOfRange(uncompressedByte, 0, LENGTH_SAV_OBS_BLOCK - 1), defaultCharSet); casewiseRecordForTabFile.add(strdatum); //out.println("casewiseRecordForTabFile(String)="+casewiseRecordForTabFile); } else if (OBSwiseTypelList.get(typeIndex) == 0) { // code= 0: numeric ByteBuffer bb_double = ByteBuffer.wrap(uncompressedByte, 0, 
LENGTH_SAV_OBS_BLOCK); if (isLittleEndian) { bb_double.order(ByteOrder.LITTLE_ENDIAN); } Double ddatum = bb_double.getDouble(); // out.println("ddatum="+ddatum); // add this non-missing-value numeric datum casewiseRecordForTabFile.add(doubleNumberFormatter.format(ddatum)); dbgLog.fine("SAV Reader: compressed: added value to dataLine: " + ddatum); } else { dbgLog.fine("SAV Reader: out-of-range exception"); throw new IOException("out-of-range value was found"); } /* // EOF-check after reading this octate if (stream.available() == 0){ hasReachedEOF = true; dbgLog.fine( "SAV Reader: *** After reading an uncompressed octate," + " reached the end of the file at "+ii +"th iteration and i="+i+"th octate position [0-start] *****"); } */ break; case 254: // FE: used as the missing value for string variables // an empty case in a string variable also takes this value // string variable does not accept space-only data // cf: uncompressed case // 20 20 20 20 20 20 20 20 // add the string missing value // out.println("254: String missing data"); casewiseRecordForTabFile.add(" "); // add "." here? // Note that technically this byte flag (254/xFE) means // that *eight* white space characters should be // written to the output stream. This caused me // a great amount of confusion, because it appeared // to me that there was a mismatch between the number // of bytes advertised in the variable metadata and // the number of bytes actually found in the data // section of a compressed SAV file; this is because // these 8 bytes "come out of nowhere"; they are not // written in the data section, but this flag specifies // that they should be added to the output. // Also, as I pointed out above, we are only writing // out one whitespace character, not 8 as instructed. // This appears to be legit; these blocks of 8 spaces // seem to be only used for padding, and all such // multiple padding spaces are stripped anyway during // the post-processing. 
break; case 255: // FF: system missing value for numeric variables // cf: uncompressed case (sysmis) // FF FF FF FF FF FF eF FF(little endian) // add the numeric missing value dbgLog.fine("SAV Reader: compressed: Missing Value, numeric"); casewiseRecordForTabFile.add(MissingValueForTextDataFileNumeric); break; case 0: // 00: do nothing dbgLog.fine("SAV Reader: compressed: doing nothing (zero); "); break; default: //out.println("byte code(default)="+ byteCode); if ((byteCode > 0) && (byteCode < 252)) { // datum is compressed //Integer unCompressed = Integer.valueOf(byteCode -100); // add this uncompressed numeric datum Double unCompressed = Double.valueOf(byteCode - 100); dbgLog.fine("SAV Reader: compressed: default case: " + unCompressed); casewiseRecordForTabFile.add(doubleNumberFormatter.format(unCompressed)); // out.println("uncompressed="+unCompressed); // out.println("dataline="+casewiseRecordForTabFile); } }// end of switch // out.println("end of switch"); // The-end-of-a-case(row)-processing // this line that follows, and the code around it // is really confusing: int varCounter = (ii * OBS + i + 1) % nOBS; // while both OBS and LENGTH_SAV_OBS_BLOCK = 8 // (OBS was initialized as OBS=LENGTH_SAV_OBS_BLOCK), // the 2 values mean different things: // LENGTH_SAV_OBS_BLOCK is the number of bytes in one OBS; // and OBS is the number of OBS blocks that we process // at a time. I.e., we process 8 chunks of 8 bytes at a time. // This is how data is organized inside an SAV file: // 8 bytes of compression flags, followd by 8x8 or fewer // (depending on the flags) bytes of compressed data. // I should rename this OBS variable something more // meaningful. // // Also, the "varCounter" variable name is entirely // misleading -- it counts not variables, but OBS blocks. 
dbgLog.fine("SAV Reader: compressed: OBS counter=" + varCounter + "(ii=" + ii + ")"); if ((ii * OBS + i + 1) % nOBS == 0) { //out.println("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); // out.println("all variables in a case are parsed == nOBS"); // out.println("hasStringVarContinuousBlock="+hasStringVarContinuousBlock); // check whether a string-variable's continuous block exits // if so, they must be joined if (hasStringVarContinuousBlock) { // string-variable's continuous-block-concatenating-processing //out.println("concatenating process starts"); //out.println("casewiseRecordForTabFile(before)="+casewiseRecordForTabFile); //out.println("casewiseRecordForTabFile(before:size)="+casewiseRecordForTabFile.size()); StringBuilder sb = new StringBuilder(""); int firstPosition = 0; Set<Integer> removeJset = new HashSet<Integer>(); for (int j = 0; j < nOBS; j++) { dbgLog.fine("RTD: j=" + j + "-th type =" + OBSwiseTypelList.get(j)); if ((OBSwiseTypelList.get(j) == -1) || (OBSwiseTypelList.get(j) == -2)) { // Continued String variable found at j-th // position. 
look back the j-1 firstPosition = j - 1; int lastJ = j; String concatenated = null; removeJset.add(j); sb.append(casewiseRecordForTabFile.get(j - 1)); sb.append(casewiseRecordForTabFile.get(j)); for (int jc = 1;; jc++) { if ((j + jc == nOBS) || ((OBSwiseTypelList.get(j + jc) != -1) && (OBSwiseTypelList.get(j + jc) != -2))) { // j is the end unit of this string variable concatenated = sb.toString(); sb.setLength(0); lastJ = j + jc; break; } else { sb.append(casewiseRecordForTabFile.get(j + jc)); removeJset.add(j + jc); } } casewiseRecordForTabFile.set(j - 1, concatenated); //out.println(j-1+"th concatenated="+concatenated); j = lastJ - 1; } // end-of-if: continuous-OBS only } // end of loop-j //out.println("removeJset="+removeJset); // a new list that stores a new case with concatanated string data List<String> newDataLine = new ArrayList<String>(); for (int jl = 0; jl < casewiseRecordForTabFile.size(); jl++) { //out.println("jl="+jl+"-th datum =["+casewiseRecordForTabFile.get(jl)+"]"); if (!removeJset.contains(jl)) { // if (casewiseRecordForTabFile.get(jl).equals(MissingValueForTextDataFileString)){ // out.println("NA-S jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals(MissingValueForTextDataFileNumeric)){ // out.println("NA-N jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl)==null){ // out.println("null case jl="+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals("NaN")){ // out.println("NaN jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals("")){ // out.println("blank jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } else if (casewiseRecordForTabFile.get(jl).equals(" ")){ // out.println("space jl= "+jl+"=["+casewiseRecordForTabFile.get(jl)+"]"); // } newDataLine.add(casewiseRecordForTabFile.get(jl)); } else { // out.println("Excluded: 
jl="+jl+"-th datum=["+casewiseRecordForTabFile.get(jl)+"]"); } } // end of loop-jl //out.println("new casewiseRecordForTabFile="+newDataLine); //out.println("new casewiseRecordForTabFile(size)="+newDataLine.size()); casewiseRecordForTabFile = newDataLine; } // end-if: stringContinuousVar-exist case // caseIndex starts from 1 not 0 caseIndex = (ii * OBS + i + 1) / nOBS; for (int k = 0; k < casewiseRecordForTabFile.size(); k++) { dbgLog.fine("k=" + k + "-th variableTypelList=" + variableTypelList.get(k)); if (variableTypelList.get(k) > 0) { // Strip the String variables off the // whitespace padding: // [ snipped ] // I've removed the block of code above where // String values were substring()-ed to the // length specified in the variable metadata; // Doing that was not enough, since a string // can still be space-padded inside its // advertised capacity. (note that extended // variables can have many kylobytes of such // padding in them!) Plus it was completely // redundant, since we are stripping all the // trailing white spaces with // StringUtils.stripEnd() below: String paddRemoved = StringUtils .stripEnd(casewiseRecordForTabFile.get(k).toString(), null); // TODO: clean this up. For now, just make sure that strings contain at least one blank space. 
if (paddRemoved.equals("")) { paddRemoved = " "; } //casewiseRecordForTabFile.set(k, "\"" + paddRemoved.replaceAll("\"", Matcher.quoteReplacement("\\\"")) + "\""); casewiseRecordForTabFile.set(k, escapeCharacterString(paddRemoved)); // end of String var case } // end of variable-type check if (casewiseRecordForTabFile.get(k) != null && !casewiseRecordForTabFile.get(k) .equals(MissingValueForTextDataFileNumeric)) { String variableFormatType = variableFormatTypeList[k]; dbgLog.finer("k=" + k + "th printFormatTable format=" + printFormatTable.get(variableNameList.get(k))); int formatDecimalPointPosition = formatDecimalPointPositionList.get(k); if (variableFormatType.equals("date")) { dbgLog.finer("date case"); long dateDatum = Long.parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymd.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); /* saving date format */ dbgLog.finer("saving dateFormat[k] = " + sdf_ymd.toPattern()); casewiseRecordForTabFile.set(k, newDatum); dateFormatList[k] = sdf_ymd.toPattern(); //formatCategoryTable.put(variableNameList.get(k), "date"); } else if (variableFormatType.equals("time")) { dbgLog.finer("time case:DTIME or DATETIME or TIME"); //formatCategoryTable.put(variableNameList.get(k), "time"); if (printFormatTable.get(variableNameList.get(k)).equals("DTIME")) { // We're not even going to try to handle "DTIME" // values as time/dates in dataverse; this is a weird // format that nobody uses outside of SPSS. // (but we do need to remember to treat the resulting values // as character strings, not numerics!) 
if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long .parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_BIAS; String newDatum = sdf_dhms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString() .split("\\."); dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_BIAS; StringBuilder sb_time = new StringBuilder( sdf_dhms.format(new Date(dateDatum))); dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append( "." + timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); } } else if (printFormatTable.get(variableNameList.get(k)).equals("DATETIME")) { // TODO: // (for both datetime and "dateless" time) // keep the longest of the matching formats - i.e., if there are *some* // values in the vector that have thousands of a second, that should be // part of the saved format! // -- L.A. Aug. 12 2014 if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long .parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L - SPSS_DATE_OFFSET; String newDatum = sdf_ymdhms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); dateFormatList[k] = sdf_ymdhms.toPattern(); } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString() .split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L - SPSS_DATE_OFFSET; StringBuilder sb_time = new StringBuilder( sdf_ymdhms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append( "." 
+ timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); dateFormatList[k] = sdf_ymdhms.toPattern() + (formatDecimalPointPosition > 0 ? ".S" : ""); } } else if (printFormatTable.get(variableNameList.get(k)).equals("TIME")) { // TODO: // double-check that we are handling "dateless" time correctly... -- L.A. Aug. 2014 if (casewiseRecordForTabFile.get(k).toString().indexOf(".") < 0) { long dateDatum = Long .parseLong(casewiseRecordForTabFile.get(k).toString()) * 1000L; String newDatum = sdf_hms.format(new Date(dateDatum)); dbgLog.finer("k=" + k + ":" + newDatum); casewiseRecordForTabFile.set(k, newDatum); if (dateFormatList[k] == null) { dateFormatList[k] = sdf_hms.toPattern(); } } else { // decimal point included String[] timeData = casewiseRecordForTabFile.get(k).toString() .split("\\."); //dbgLog.finer(StringUtils.join(timeData, "|")); long dateDatum = Long.parseLong(timeData[0]) * 1000L; StringBuilder sb_time = new StringBuilder( sdf_hms.format(new Date(dateDatum))); //dbgLog.finer(sb_time.toString()); if (formatDecimalPointPosition > 0) { sb_time.append( "." + timeData[1].substring(0, formatDecimalPointPosition)); } dbgLog.finer("k=" + k + ":" + sb_time.toString()); casewiseRecordForTabFile.set(k, sb_time.toString()); String format_hmsS = sdf_hms.toPattern() + (formatDecimalPointPosition > 0 ? 
".S" : ""); if (dateFormatList[k] == null || (format_hmsS.length() > dateFormatList[k].length())) { dateFormatList[k] = format_hmsS; } } } } else if (variableFormatType.equals("other")) { dbgLog.finer("other non-date/time case:=" + i); if (printFormatTable.get(variableNameList.get(k)).equals("WKDAY")) { // day of week dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.WEEKDAY_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.WEEKDAY_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); dbgLog.finer("wkday:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } else if (printFormatTable.get(variableNameList.get(k)).equals("MONTH")) { // month dbgLog.finer("data k=" + k + ":" + casewiseRecordForTabFile.get(k)); dbgLog.finer("data k=" + k + ":" + SPSSConstants.MONTH_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1)); String newDatum = SPSSConstants.MONTH_LIST.get( Integer.valueOf(casewiseRecordForTabFile.get(k).toString()) - 1); casewiseRecordForTabFile.set(k, newDatum); dbgLog.finer("month:k=" + k + ":" + casewiseRecordForTabFile.get(k)); } } } // end: date-time-datum check } // end: loop-k(2nd: variable-wise-check) // write to tab file if (casewiseRecordForTabFile.size() > 0) { pwout.println(StringUtils.join(casewiseRecordForTabFile, "\t")); } // numeric contents-check for (int l = 0; l < casewiseRecordForTabFile.size(); l++) { if (variableFormatTypeList[l].equals("date") || variableFormatTypeList[l].equals("time") || printFormatTable.get(variableNameList.get(l)).equals("WKDAY") || printFormatTable.get(variableNameList.get(l)).equals("MONTH")) { // TODO: // figure out if any special handling is still needed here in 4.0. // -- L.A. - Aug. 
2014 } else { if (variableTypelList.get(l) <= 0) { if (casewiseRecordForTabFile.get(l).toString().indexOf(".") >= 0) { decimalVariableSet.add(l); } } } } // reset the case-wise working objects casewiseRecordForTabFile.clear(); if (caseQnty > 0) { if (caseIndex == caseQnty) { hasReachedEOF = true; } } if (hasReachedEOF) { break; } } // if(The-end-of-a-case(row)-processing) } // loop-i (OBS unit) if ((hasReachedEOF) || (stream.available() == 0)) { // reached the end of this file // do exit-processing dbgLog.fine("***** reached the end of the file at " + ii + "th iteration *****"); break OBSERVATION; } ii++; } // while loop pwout.close(); } catch (IOException ex) { throw ex; } dbgLog.fine("<<<<<<"); dbgLog.fine("formatCategoryTable = " + formatCategoryTable); dbgLog.fine(">>>>>>"); dbgLog.fine("decimalVariableSet=" + decimalVariableSet); dbgLog.fine("decodeRecordTypeDataCompressed(): end"); }