List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
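Before the project-specific examples below, here is a minimal, self-contained sketch of the basic pattern they all share: build a Configuration, obtain a FileSystem, call open(Path) to get an FSDataInputStream, and wrap it in a reader. The path /tmp/example.txt is a placeholder chosen for illustration, not something taken from the examples.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemOpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();   // picks up core-site.xml / hdfs-site.xml if on the classpath
        Path input = new Path("/tmp/example.txt");  // placeholder path
        FileSystem fs = FileSystem.get(conf);
        // open() returns an FSDataInputStream; wrap it for line-oriented reading
        try (FSDataInputStream in = fs.open(input);
             BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}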
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Merges part files (if any), prepends a generated header line, and writes the result to the destination path.
 *
 * @param srcFileName  source file or directory on HDFS
 * @param destFileName destination file on HDFS
 * @param rlen         number of rows
 * @param clen         number of columns
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile
        /*
         * TODO: Remove this roundabout way!
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
         * & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case, the directory structure /user/biadmin/csv/temp/out must be created.
         * A simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if it exists already
        //boolean ret1 = hdfs.delete(destFilePath, true);

        // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
        //boolean ret2 = hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        //boolean ret3 = hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        //boolean ret4 = hdfs.rename(srcFilePath, destFilePath);

        //System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {
        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath);       // force the creation of directory structure
        hdfs.delete(destFilePath, true);        // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data
    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java
License:Open Source License
public JSONObject parseSpec() throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(_specFile))));
    JSONObject obj = JSONHelper.parse(br);
    br.close();
    return obj;
}
From source file:com.ibm.bi.dml.runtime.transform.BinAgent.java
License:Open Source License
/**
 * Method to load transform metadata for all attributes
 *
 * @param job
 * @throws IOException
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    if (_binList == null)
        return;

    if (fs.isDirectory(txMtdDir)) {
        for (int i = 0; i < _binList.length; i++) {
            int colID = _binList[i];

            Path path = new Path(txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
            // format: colID,min,max,binwidth,nbins
            String[] fields = br.readLine().split(TXMTD_SEP);
            double min = UtilFunctions.parseToDouble(fields[1]);
            //double max = UtilFunctions.parseToDouble(fields[2]);
            double binwidth = UtilFunctions.parseToDouble(fields[3]);
            int nbins = UtilFunctions.parseToInt(fields[4]);

            _numBins[i] = nbins;
            _min[i] = min;
            _binWidths[i] = binwidth; // (max-min)/nbins;

            br.close();
        }
    } else {
        fs.close();
        throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
    }
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Method to read the header line from the input data file.
 *
 * @param fs
 * @param prop
 * @param smallestFile
 * @return
 * @throws IOException
 */
private static String readHeaderLine(FileSystem fs, CSVFileFormatProperties prop, String smallestFile)
        throws IOException {
    String line = null;

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(smallestFile))));
    line = br.readLine();
    br.close();

    if (!prop.hasHeader()) {
        // construct header with default column names, V1, V2, etc.
        int ncol = Pattern.compile(Pattern.quote(prop.getDelim())).split(line, -1).length;
        line = null;

        StringBuilder sb = new StringBuilder();
        sb.append("V1");
        for (int i = 2; i <= ncol; i++)
            sb.append(prop.getDelim() + "V" + i);
        line = sb.toString();
    }

    return line;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Convert input transformation specification file with column names into a
 * specification with corresponding column IDs. This file is sent to all the
 * relevant MR jobs.
 *
 * @param fs
 * @param inputPath
 * @param smallestFile
 * @param colNames
 * @param prop
 * @param specFileWithNames
 * @return
 * @throws IllegalArgumentException
 * @throws IOException
 * @throws JSONException
 */
private static String processSpecFile(FileSystem fs, String inputPath, String smallestFile,
        HashMap<String, Integer> colNames, CSVFileFormatProperties prop, String specFileWithNames)
        throws IllegalArgumentException, IOException, JSONException {
    // load input spec file with Names
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(specFileWithNames))));
    JSONObject inputSpec = JSONHelper.parse(br);
    br.close();

    final String NAME = "name";
    final String ID = "id";
    final String METHOD = "method";
    final String VALUE = "value";
    final String MV_METHOD_MEAN = "global_mean";
    final String MV_METHOD_MODE = "global_mode";
    final String MV_METHOD_CONSTANT = "constant";
    final String BIN_METHOD_WIDTH = "equi-width";
    final String BIN_METHOD_HEIGHT = "equi-height";
    final String SCALE_METHOD_Z = "z-score";
    final String SCALE_METHOD_M = "mean-subtraction";
    final String JSON_BYPOS = "ids";

    String stmp = null;
    JSONObject entry = null;
    byte btmp = 0;

    final int[] mvList;
    int[] rcdList, dcdList, omitList;
    final int[] binList;
    final int[] scaleList;
    byte[] mvMethods = null, binMethods = null, scaleMethods = null;
    Object[] numBins = null;
    Object[] mvConstants = null;

    boolean byPositions = (inputSpec.containsKey(JSON_BYPOS)
            && ((Boolean) inputSpec.get(JSON_BYPOS)).booleanValue() == true);

    // --------------------------------------------------------------------------
    // Omit
    if (inputSpec.containsKey(TX_METHOD.OMIT.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.OMIT.toString());
        omitList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                omitList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                omitList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(omitList);
    } else
        omitList = null;

    // --------------------------------------------------------------------------
    // Missing value imputation
    if (inputSpec.containsKey(TX_METHOD.IMPUTE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.IMPUTE.toString());

        mvList = new int[arrtmp.size()];
        mvMethods = new byte[arrtmp.size()];
        mvConstants = new Object[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                mvList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                mvList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(MV_METHOD_MEAN))
                btmp = (byte) 1;
            else if (stmp.equals(MV_METHOD_MODE))
                btmp = (byte) 2;
            else if (stmp.equals(MV_METHOD_CONSTANT))
                btmp = (byte) 3;
            else
                throw new IOException("Unknown missing value imputation method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            mvMethods[i] = btmp; //txMethods.add( btmp );

            mvConstants[i] = null;
            if (entry.containsKey(VALUE))
                mvConstants[i] = entry.get(VALUE);
        }

        Integer[] idx = new Integer[mvList.length];
        for (int i = 0; i < mvList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (mvList[o1] - mvList[o2]);
            }
        });

        // rearrange mvList, mvMethods, and mvConstants according to permutation idx
        inplacePermute(mvList, mvMethods, mvConstants, idx);
    } else
        mvList = null;

    // --------------------------------------------------------------------------
    // Recoding
    if (inputSpec.containsKey(TX_METHOD.RECODE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.RECODE.toString());
        rcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                rcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                rcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(rcdList);
    } else
        rcdList = null;

    // --------------------------------------------------------------------------
    // Binning
    if (inputSpec.containsKey(TX_METHOD.BIN.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.BIN.toString());

        binList = new int[arrtmp.size()];
        binMethods = new byte[arrtmp.size()];
        numBins = new Object[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                binList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                binList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(BIN_METHOD_WIDTH))
                btmp = (byte) 1;
            else if (stmp.equals(BIN_METHOD_HEIGHT))
                throw new IOException("Equi-height binning method is not yet supported, in transformation specification file: "
                        + specFileWithNames);
            else
                throw new IOException("Unknown binning method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            binMethods[i] = btmp;

            numBins[i] = entry.get(TransformationAgent.JSON_NBINS);
            if (((Integer) numBins[i]).intValue() <= 1)
                throw new IllegalArgumentException("Invalid transformation on column \""
                        + (String) entry.get(NAME) + "\". Number of bins must be greater than 1.");
        }

        Integer[] idx = new Integer[binList.length];
        for (int i = 0; i < binList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (binList[o1] - binList[o2]);
            }
        });

        // rearrange binList and binMethods according to permutation idx
        inplacePermute(binList, binMethods, numBins, idx);
    } else
        binList = null;

    // --------------------------------------------------------------------------
    // Dummycoding
    if (inputSpec.containsKey(TX_METHOD.DUMMYCODE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.DUMMYCODE.toString());
        dcdList = new int[arrtmp.size()];
        for (int i = 0; i < arrtmp.size(); i++) {
            if (byPositions)
                dcdList[i] = UtilFunctions.toInt(arrtmp.get(i));
            else {
                stmp = UtilFunctions.unquote((String) arrtmp.get(i));
                dcdList[i] = colNames.get(stmp);
            }
        }
        Arrays.sort(dcdList);
    } else
        dcdList = null;

    // --------------------------------------------------------------------------
    // Scaling
    if (inputSpec.containsKey(TX_METHOD.SCALE.toString())) {
        JSONArray arrtmp = (JSONArray) inputSpec.get(TX_METHOD.SCALE.toString());

        scaleList = new int[arrtmp.size()];
        scaleMethods = new byte[arrtmp.size()];

        for (int i = 0; i < arrtmp.size(); i++) {
            entry = (JSONObject) arrtmp.get(i);

            if (byPositions) {
                scaleList[i] = UtilFunctions.toInt(entry.get(ID));
            } else {
                stmp = UtilFunctions.unquote((String) entry.get(NAME));
                scaleList[i] = colNames.get(stmp);
            }

            stmp = UtilFunctions.unquote((String) entry.get(METHOD));
            if (stmp.equals(SCALE_METHOD_M))
                btmp = (byte) 1;
            else if (stmp.equals(SCALE_METHOD_Z))
                btmp = (byte) 2;
            else
                throw new IOException("Unknown scaling method (" + stmp
                        + ") in transformation specification file: " + specFileWithNames);
            scaleMethods[i] = btmp;
        }

        Integer[] idx = new Integer[scaleList.length];
        for (int i = 0; i < scaleList.length; i++)
            idx[i] = i;
        Arrays.sort(idx, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return (scaleList[o1] - scaleList[o2]);
            }
        });

        // rearrange scaleList and scaleMethods according to permutation idx
        inplacePermute(scaleList, scaleMethods, null, idx);
    } else
        scaleList = null;

    // --------------------------------------------------------------------------
    // check for column IDs that are imputed with mode, but not recoded
    // These columns have to be handled separately, because the computation of mode
    // requires the computation of distinct values (i.e., recode maps)
    ArrayList<Integer> tmpList = new ArrayList<Integer>();
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];
            if (mvMethods[i] == 2 && (rcdList == null || Arrays.binarySearch(rcdList, colID) < 0))
                tmpList.add(colID);
        }

    int[] mvrcdList = null;
    if (tmpList.size() > 0) {
        mvrcdList = new int[tmpList.size()];
        for (int i = 0; i < tmpList.size(); i++)
            mvrcdList[i] = tmpList.get(i);
    }

    // Perform Validity Checks
    /*
     *      OMIT MVI RCD BIN DCD SCL
     * OMIT  -    x   *   *   *   *
     * MVI   x    -   *   *   *   *
     * RCD   *    *   -   x   *   x
     * BIN   *    *   x   -   *   x
     * DCD   *    *   *   *   -   x
     * SCL   *    *   x   x   x   -
     */
    if (mvList != null)
        for (int i = 0; i < mvList.length; i++) {
            int colID = mvList[i];

            if (omitList != null && Arrays.binarySearch(omitList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be both omitted and imputed.");

            if (mvMethods[i] == 1) {
                if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                    throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                            + ". A numeric column can not be recoded.");

                if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                    // throw an error only if the column is not binned
                    if (binList == null || Arrays.binarySearch(binList, colID) < 0)
                        throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                                + ". A numeric column can not be dummycoded.");
            }
        }

    if (scaleList != null)
        for (int i = 0; i < scaleList.length; i++) {
            int colID = scaleList[i];
            if (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be recoded and scaled.");
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be binned and scaled.");
            if (dcdList != null && Arrays.binarySearch(dcdList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be dummycoded and scaled.");
        }

    if (rcdList != null)
        for (int i = 0; i < rcdList.length; i++) {
            int colID = rcdList[i];
            if (binList != null && Arrays.binarySearch(binList, colID) >= 0)
                throw new IllegalArgumentException("Invalid transformations on column ID " + colID
                        + ". A column can not be recoded and binned.");
        }

    // Check if dummycoded columns are either recoded or binned.
    // If not, add them to recode list.
    ArrayList<Integer> addToRcd = new ArrayList<Integer>();
    if (dcdList != null)
        for (int i = 0; i < dcdList.length; i++) {
            int colID = dcdList[i];
            boolean isRecoded = (rcdList != null && Arrays.binarySearch(rcdList, colID) >= 0);
            boolean isBinned = (binList != null && Arrays.binarySearch(binList, colID) >= 0);
            // If colID is neither recoded nor binned, then add it to rcdList.
            if (!isRecoded && !isBinned)
                addToRcd.add(colID);
        }
    if (addToRcd.size() > 0) {
        int[] newRcdList = null;
        if (rcdList != null)
            newRcdList = Arrays.copyOf(rcdList, rcdList.length + addToRcd.size());
        else
            newRcdList = new int[addToRcd.size()];
        int i = (rcdList != null ? rcdList.length : 0);
        for (int idx = 0; i < newRcdList.length; i++, idx++)
            newRcdList[i] = addToRcd.get(idx);
        Arrays.sort(newRcdList);
        rcdList = newRcdList;
    }

    // -----------------------------------------------------------------------------
    // Prepare output spec
    JSONObject outputSpec = new JSONObject();

    if (omitList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(omitList));
        outputSpec.put(TX_METHOD.OMIT.toString(), rcdSpec);
    }

    if (mvList != null) {
        JSONObject mvSpec = new JSONObject();
        mvSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvList));
        mvSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(mvMethods));
        mvSpec.put(TransformationAgent.JSON_CONSTS, toJSONArray(mvConstants));
        outputSpec.put(TX_METHOD.IMPUTE.toString(), mvSpec);
    }

    if (rcdList != null) {
        JSONObject rcdSpec = new JSONObject();
        rcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(rcdList));
        outputSpec.put(TX_METHOD.RECODE.toString(), rcdSpec);
    }

    if (binList != null) {
        JSONObject binSpec = new JSONObject();
        binSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(binList));
        binSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(binMethods));
        binSpec.put(TransformationAgent.JSON_NBINS, toJSONArray(numBins));
        outputSpec.put(TX_METHOD.BIN.toString(), binSpec);
    }

    if (dcdList != null) {
        JSONObject dcdSpec = new JSONObject();
        dcdSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(dcdList));
        outputSpec.put(TX_METHOD.DUMMYCODE.toString(), dcdSpec);
    }

    if (scaleList != null) {
        JSONObject scaleSpec = new JSONObject();
        scaleSpec.put(TransformationAgent.JSON_ATTRS, toJSONArray(scaleList));
        scaleSpec.put(TransformationAgent.JSON_MTHD, toJSONArray(scaleMethods));
        outputSpec.put(TX_METHOD.SCALE.toString(), scaleSpec);
    }

    if (mvrcdList != null) {
        JSONObject mvrcd = new JSONObject();
        mvrcd.put(TransformationAgent.JSON_ATTRS, toJSONArray(mvrcdList));
        outputSpec.put(TX_METHOD.MVRCD.toString(), mvrcd);
    }

    // write out the spec with IDs
    String specFileWithIDs = MRJobConfiguration.constructTempOutputFilename();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(specFileWithIDs), true)));
    out.write(outputSpec.toString());
    out.close();

    return specFileWithIDs;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Helper function to determine the number of columns after applying
 * transformations. Note that dummycoding changes the number of columns.
 *
 * @param fs
 * @param header
 * @param delim
 * @param tfMtdPath
 * @return
 * @throws IllegalArgumentException
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws JSONException
 */
private static int getNumColumnsTf(FileSystem fs, String header, String delim, String tfMtdPath)
        throws IllegalArgumentException, IOException, DMLRuntimeException, JSONException {
    String[] columnNames = Pattern.compile(Pattern.quote(delim)).split(header, -1);
    int ret = columnNames.length;

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(tfMtdPath + "/spec.json"))));
    JSONObject spec = JSONHelper.parse(br);
    br.close();

    // fetch relevant attribute lists
    if (!spec.containsKey(TX_METHOD.DUMMYCODE.toString()))
        return ret;

    JSONArray dcdList = (JSONArray) ((JSONObject) spec.get(TX_METHOD.DUMMYCODE.toString()))
            .get(TransformationAgent.JSON_ATTRS);

    // look for numBins among binned columns
    for (Object o : dcdList) {
        int id = UtilFunctions.toInt(o);

        Path binpath = new Path(tfMtdPath + "/Bin/" + UtilFunctions.unquote(columnNames[id - 1])
                + TransformationAgent.BIN_FILE_SUFFIX);
        Path rcdpath = new Path(tfMtdPath + "/Recode/" + UtilFunctions.unquote(columnNames[id - 1])
                + TransformationAgent.NDISTINCT_FILE_SUFFIX);

        if (TfUtils.checkValidInputFile(fs, binpath, false)) {
            br = new BufferedReader(new InputStreamReader(fs.open(binpath)));
            int nbins = UtilFunctions.parseToInt(br.readLine().split(TransformationAgent.TXMTD_SEP)[4]);
            br.close();
            ret += (nbins - 1);
        } else if (TfUtils.checkValidInputFile(fs, rcdpath, false)) {
            br = new BufferedReader(new InputStreamReader(fs.open(rcdpath)));
            int ndistinct = UtilFunctions.parseToInt(br.readLine());
            br.close();
            ret += (ndistinct - 1);
        } else
            throw new DMLRuntimeException("Relevant transformation metadata for column (id=" + id + ", name="
                    + columnNames[id - 1] + ") is not found.");
    }
    //System.out.println("Number of columns in transformed data: " + ret);

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
private static String getOutputHeader(FileSystem fs, String headerLine, TransformOperands oprnds)
        throws IOException {
    String ret = null;

    if (oprnds.isApply) {
        BufferedReader br = new BufferedReader(new InputStreamReader(
                fs.open(new Path(oprnds.applyTxPath + "/" + TransformationAgent.OUT_HEADER))));
        ret = br.readLine();
        br.close();
    } else {
        if (oprnds.outNamesFile == null)
            ret = headerLine;
        else {
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(oprnds.outNamesFile))));
            ret = br.readLine();
            br.close();
        }
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
private static int[] countNumRows(ArrayList<Path> files, CSVFileFormatProperties prop, FileSystem fs,
        TfUtils agents) throws IOException {
    int[] rows = new int[2];
    int numRows = 0, numRowsTf = 0;

    OmitAgent oa = agents.getOmitAgent();

    if (!oa.isApplicable()) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            while (br.readLine() != null)
                numRows++;
            br.close();
        }
        numRowsTf = numRows;
    } else {
        String line = null;
        String[] words;

        Pattern delim = Pattern.compile(Pattern.quote(prop.getDelim()));

        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            while ((line = br.readLine()) != null) {
                numRows++;

                words = delim.split(line, -1);
                if (!oa.omit(words, agents))
                    numRowsTf++;
            }
            br.close();
        }
    }

    rows[0] = numRows;
    rows[1] = numRowsTf;

    return rows;
}
From source file:com.ibm.bi.dml.runtime.transform.DataTransform.java
License:Open Source License
/**
 * Main method to create and/or apply transformation metadata in-memory, on a single node.
 *
 * @param job
 * @param fs
 * @param inputPath
 * @param ncols
 * @param prop
 * @param specFileWithIDs
 * @param tfMtdPath
 * @param isApply
 * @param result
 * @param headerLine
 * @param isBB
 * @param isCSV
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws JSONException
 * @throws IllegalArgumentException
 */
private static JobReturn performTransform(JobConf job, FileSystem fs, String inputPath, int ncols,
        CSVFileFormatProperties prop, String specFileWithIDs, String tfMtdPath, boolean isApply,
        MatrixObject result, String headerLine, boolean isBB, boolean isCSV)
        throws IOException, DMLRuntimeException, IllegalArgumentException, JSONException {
    String[] na = TfUtils.parseNAStrings(prop.getNAStrings());

    JSONObject spec = TfUtils.readSpec(fs, specFileWithIDs);
    TfUtils agents = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), na, spec, ncols, tfMtdPath,
            null, null);

    MVImputeAgent _mia = agents.getMVImputeAgent();
    RecodeAgent _ra = agents.getRecodeAgent();
    BinAgent _ba = agents.getBinAgent();
    DummycodeAgent _da = agents.getDummycodeAgent();

    // List of files to read
    ArrayList<Path> files = collectInputFiles(inputPath, fs);

    // ---------------------------------
    // Construct transformation metadata
    // ---------------------------------

    String line = null;
    String[] words = null;

    int numColumnsTf = 0;
    BufferedReader br = null;

    if (!isApply) {
        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            if (fileNo == 0 && prop.hasHeader())
                br.readLine(); //ignore header

            line = null;
            while ((line = br.readLine()) != null) {
                agents.prepareTfMtd(line);
            }
            br.close();
        }

        if (agents.getValid() == 0)
            throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);

        _mia.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ba.outputTransformationMetadata(tfMtdPath, fs, agents);
        _ra.outputTransformationMetadata(tfMtdPath, fs, agents);

        // prepare agents for the subsequent phase of applying transformation metadata

        // no need to loadTxMtd for _ra, since the maps are already present in memory
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);

        _da.setRecodeMapsCP(_ra.getCPRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    } else {
        // Count the number of rows
        int rows[] = countNumRows(files, prop, fs, agents);
        agents.setTotal(rows[0]);
        agents.setValid(rows[1]);

        if (agents.getValid() == 0)
            throw new DMLRuntimeException(
                    "Number of rows in the transformed output (potentially, after omitting the ones with missing values) is zero. Cannot proceed.");

        // Load transformation metadata
        // prepare agents for the subsequent phase of applying transformation metadata
        Path tmp = new Path(tfMtdPath);
        _mia.loadTxMtd(job, fs, tmp, agents);
        _ra.loadTxMtd(job, fs, tmp, agents);
        _ba.loadTxMtd(job, fs, tmp, agents);

        _da.setRecodeMaps(_ra.getRecodeMaps());
        _da.setNumBins(_ba.getBinList(), _ba.getNumBins());
        _da.loadTxMtd(job, fs, tmp, agents);
    }

    // -----------------------------
    // Apply transformation metadata
    // -----------------------------

    numColumnsTf = getNumColumnsTf(fs, headerLine, prop.getDelim(), tfMtdPath);

    MapReduceTool.deleteFileIfExistOnHDFS(result.getFileName());
    BufferedWriter out = new BufferedWriter(
            new OutputStreamWriter(fs.create(new Path(result.getFileName()), true)));

    StringBuilder sb = new StringBuilder();

    MatrixBlock mb = null;
    if (isBB) {
        int estNNZ = (int) agents.getValid() * ncols;
        mb = new MatrixBlock((int) agents.getValid(), numColumnsTf, estNNZ);

        if (mb.isInSparseFormat())
            mb.allocateSparseRowsBlock();
        else
            mb.allocateDenseBlock();
    }

    int rowID = 0; // rowid to be used in filling the matrix block

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0) {
            String header = null;
            if (prop.hasHeader())
                br.readLine(); // ignore the header line from data file

            header = headerLine;
            String dcdHeader = _da.constructDummycodedHeader(header, agents.getDelim());
            numColumnsTf = _da.genDcdMapsAndColTypes(fs, tfMtdPath, ncols, agents);
            DataTransform.generateHeaderFiles(fs, tfMtdPath, header, dcdHeader);
        }

        line = null;
        while ((line = br.readLine()) != null) {
            words = agents.getWords(line);

            if (!agents.omit(words)) {
                words = agents.apply(words, !isApply);

                if (isCSV) {
                    out.write(agents.checkAndPrepOutputString(words, sb));
                    out.write("\n");
                }

                if (isBB) {
                    agents.check(words);
                    for (int c = 0; c < words.length; c++) {
                        if (words[c] != null && !words[c].isEmpty())
                            mb.appendValue(rowID, c, UtilFunctions.parseToDouble(words[c]));
                    }
                }
                rowID++;
            }
        }
        br.close();
    }
    out.close();

    if (mb != null) {
        mb.recomputeNonZeros();
        mb.examSparsity();

        result.acquireModify(mb);
        result.release();
        result.exportData();
    }

    MatrixCharacteristics mc = new MatrixCharacteristics(agents.getValid(), numColumnsTf,
            (int) result.getNumRowsPerBlock(), (int) result.getNumColumnsPerBlock());
    JobReturn ret = new JobReturn(new MatrixCharacteristics[] { mc }, true);

    return ret;
}
From source file:com.ibm.bi.dml.runtime.transform.MVImputeAgent.java
License:Open Source License
private String readReplacement(int colID, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    Path path = new Path(txMtdDir + "/Impute/" + agents.getName(colID) + MV_FILE_SUFFIX);
    TfUtils.checkValidInputFile(fs, path, true);

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    String replacement = UtilFunctions.unquote(line.split(TXMTD_SEP)[1]);
    br.close();

    return replacement;
}