Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
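
Below is a minimal, self-contained sketch of calling globStatus; the glob pattern /data/logs/2015-*/*.log is hypothetical. As several of the examples further down also do, it checks for both a null result (a non-glob path that does not exist) and an empty array (a glob that matches nothing) before using the statuses.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical pattern; adjust to the cluster's actual layout.
        Path pattern = new Path("/data/logs/2015-*/*.log");

        // globStatus may return null (a non-glob path that does not exist)
        // or an empty array (a glob that matched nothing), so check both.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("No files matched " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}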

Usage

From source file:org.springframework.yarn.batch.partition.MultiHdfsResourcePartitioner.java

License:Apache License

/**
 * Assign the filename of each of the injected resources to an
 * {@link ExecutionContext}.
 *
 * @see Partitioner#partition(int)
 */
public Map<String, ExecutionContext> partition(int gridSize) {
    Map<String, ExecutionContext> contexts = new HashMap<String, ExecutionContext>();

    try {
        FileSystem fs = FileSystem.get(configuration);
        int i = 0;
        for (Resource resource : resources) {
            Assert.state(resource.exists(), "Resource does not exist: " + resource);
            Path path = new Path(resource.getURL().getPath());

            FileStatus[] fileStatuses = fs.globStatus(path);
            if (fileStatuses == null || fileStatuses.length != 1) {
                throw new IllegalArgumentException("Error getting file status for resource=" + resource);
            }

            if (splitFile) {
                // get split size and estimate split blocks
                long blockSize = fileStatuses[0].getBlockSize();
                long[] positions = getSplitPositions(blockSize, fileStatuses[0].getLen(), splitSize,
                        forceSplit);
                long position = 0;
                for (int j = 0; j < positions.length; j++) {
                    contexts.put(PARTITION_KEY + i++,
                            createExecutionContext(resource, position, positions[j] - position - 1));
                    position = positions[j];
                }
                contexts.put(PARTITION_KEY + i++,
                        createExecutionContext(resource, position, fileStatuses[0].getLen() - position));
            } else {
                // just add file
                contexts.put(PARTITION_KEY + i++,
                        createExecutionContext(resource, 0, fileStatuses[0].getLen()));
            }
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Error partitioning splits", e);
    }

    return contexts;
}

From source file:org.springframework.yarn.fs.DefaultResourceLocalizer.java

License:Apache License

/**
 * Gets a map of localized resources.
 *
 * @param fs the file system
 * @return a map of localized resources
 * @throws IOException if problem occurred getting file status
 * @throws URISyntaxException if file path is wrong
 */
protected Map<String, LocalResource> doFileTransfer(FileSystem fs) throws IOException, URISyntaxException {
    Map<String, LocalResource> returned = new HashMap<String, LocalResource>();
    Path resolvedStagingDirectory = resolveStagingDirectory();
    for (TransferEntry e : transferEntries) {
        Path remotePath = (!e.staging) ? new Path(e.remote + e.path)
                : new Path(e.remote + resolvedStagingDirectory.toUri().getPath() + e.path);
        URI localUri = new URI(e.local);
        FileStatus[] fileStatuses = fs.globStatus(remotePath);
        if (log.isDebugEnabled()) {
            log.debug("Trying path " + remotePath + " glob fileStatus length="
                    + (fileStatuses != null ? fileStatuses.length : "null"));
        }
        if (!ObjectUtils.isEmpty(fileStatuses)) {
            for (FileStatus status : fileStatuses) {
                if (log.isDebugEnabled()) {
                    log.debug("FileStatus=" + status);
                }
                if (status.isFile()) {
                    URI remoteUri = status.getPath().toUri();
                    Path path = new Path(new Path(localUri), remoteUri.getPath());
                    LocalResource res = Records.newRecord(LocalResource.class);
                    res.setType(e.type);
                    res.setVisibility(e.visibility);
                    res.setResource(ConverterUtils.getYarnUrlFromPath(path));
                    res.setTimestamp(status.getModificationTime());
                    res.setSize(status.getLen());
                    if (log.isDebugEnabled()) {
                        log.debug("Using remote uri [" + remoteUri + "] and local uri [" + localUri
                                + "] converted to path [" + path + "]");
                    }
                    returned.put(status.getPath().getName(), res);
                }
            }
        }
    }
    return returned;
}

From source file:org.trafodion.sql.HBaseAccess.HBaseClient.java

License:Apache License

public boolean estimateRowCount(String tblName, int partialRowSize, int numCols, long[] rc)
        throws MasterNotRunningException, IOException, ClassNotFoundException, URISyntaxException {
    if (logger.isDebugEnabled())
        logger.debug("HBaseClient.estimateRowCount(" + tblName + ") called.");

    final String REGION_NAME_PATTERN = "[0-9a-f]*";
    final String HFILE_NAME_PATTERN = "[0-9a-f]*";

    // To estimate incidence of nulls, read the first 500 rows worth
    // of KeyValues.
    final int ROWS_TO_SAMPLE = 500;
    int putKVsSampled = 0;
    int nonPutKVsSampled = 0;
    int nullCount = 0;
    long totalEntries = 0; // KeyValues in all HFiles for table
    long totalSizeBytes = 0; // Size of all HFiles for table 
    long estimatedTotalPuts = 0;
    boolean more = true;

    // Access the file system to go directly to the table's HFiles.
    // Create a reader for the file to access the entry count stored
    // in the trailer block, and a scanner to iterate over a few
    // hundred KeyValues to estimate the incidence of nulls.
    long nano1, nano2;
    nano1 = System.nanoTime();
    FileSystem fileSystem = FileSystem.get(config);
    nano2 = System.nanoTime();
    if (logger.isDebugEnabled())
        logger.debug("FileSystem.get() took " + ((nano2 - nano1) + 500000) / 1000000 + " milliseconds.");
    CacheConfig cacheConf = new CacheConfig(config);
    String hbaseRootPath = config.get(HConstants.HBASE_DIR).trim();
    if (hbaseRootPath.charAt(0) != '/')
        hbaseRootPath = new URI(hbaseRootPath).getPath();
    if (logger.isDebugEnabled())
        logger.debug("hbaseRootPath = " + hbaseRootPath);
    FileStatus[] fsArr = fileSystem.globStatus(new Path(hbaseRootPath + "/data/default/" + tblName + "/"
            + REGION_NAME_PATTERN + "/#1/" + HFILE_NAME_PATTERN));
    for (FileStatus fs : fsArr) {
        // Make sure the file name conforms to HFile name pattern.
        if (!StoreFileInfo.isHFile(fs.getPath())) {
            if (logger.isDebugEnabled())
                logger.debug("Skipped file " + fs.getPath() + " -- not a valid HFile name.");
            continue;
        }
        HFile.Reader reader = HFile.createReader(fileSystem, fs.getPath(), cacheConf, config);
        try {
            totalEntries += reader.getEntries();
            totalSizeBytes += reader.length();
            //printQualifiers(reader, 100);
            if (ROWS_TO_SAMPLE > 0 && totalEntries == reader.getEntries()) { // first file only
                // Trafodion column qualifiers are ordinal numbers, which
                // makes it easy to count missing (null) values. We also count
                // the non-Put KVs (typically delete-row markers) to estimate
                // their frequency in the full file set.
                HFileScanner scanner = reader.getScanner(false, false, false);
                scanner.seekTo(); //position at beginning of first data block
                byte currQual = 0;
                byte nextQual;
                do {
                    KeyValue kv = scanner.getKeyValue();
                    if (kv.getType() == KeyValue.Type.Put.getCode()) {
                        nextQual = kv.getQualifier()[0];
                        if (nextQual <= currQual)
                            nullCount += ((numCols - currQual) // nulls at end of this row
                                    + (nextQual - 1)); // nulls at start of next row
                        else
                            nullCount += (nextQual - currQual - 1);
                        currQual = nextQual;
                        putKVsSampled++;
                    } else {
                        nonPutKVsSampled++; // don't count these toward the number
                    } //   we want to scan
                } while ((putKVsSampled + nullCount) < (numCols * ROWS_TO_SAMPLE) && (more = scanner.next()));

                // If all rows were read, count any nulls at end of last row.
                if (!more && putKVsSampled > 0)
                    nullCount += (numCols - currQual);

                if (logger.isDebugEnabled())
                    logger.debug("Sampled " + nullCount + " nulls.");
            } // code for first file
        } finally {
            reader.close(false);
        }
    } // for

    long estimatedEntries = (ROWS_TO_SAMPLE > 0 ? 0 // get from sample data, below
            : totalEntries); // no sampling, use stored value
    if (putKVsSampled > 0) // avoid div by 0 if no Put KVs in sample
    {
        estimatedTotalPuts = (putKVsSampled * totalEntries) / (putKVsSampled + nonPutKVsSampled);
        estimatedEntries = ((putKVsSampled + nullCount) * estimatedTotalPuts) / putKVsSampled;
    }

    // Calculate estimate of rows in all HFiles of table.
    rc[0] = (estimatedEntries + (numCols / 2)) / numCols; // round instead of truncate

    // Estimate # of rows in MemStores of all regions of table. Pass
    // a value to divide the size of the MemStore by. Base this on the
    // ratio of bytes-to-rows in the HFiles, or the actual row size if
    // the HFiles were empty.
    int rowSize;
    if (rc[0] > 0)
        rowSize = (int) (totalSizeBytes / rc[0]);
    else {
        // From Traf metadata we have calculated and passed in part of the row
        // size, including size of column qualifiers (col names), which are not
        // known to HBase.  Add to this the length of the fixed part of the
        // KeyValue format, times the number of columns.
        int fixedSizePartOfKV = KeyValue.KEYVALUE_INFRASTRUCTURE_SIZE // key len + value len
                + KeyValue.KEY_INFRASTRUCTURE_SIZE; // rowkey & col family len, timestamp, key type
        rowSize = partialRowSize // for all cols: row key + col qualifiers + values
                + (fixedSizePartOfKV * numCols);

        // Trafodion tables have a single col family at present, so we only look
        // at the first family name, and multiply its length times the number of
        // columns. Even if more than one family is used in the future, presumably
        // they will all be the same short size.
        HTable htbl = new HTable(config, tblName);
        HTableDescriptor htblDesc = htbl.getTableDescriptor();
        HColumnDescriptor[] families = htblDesc.getColumnFamilies();
        rowSize += (families[0].getName().length * numCols);
    }

    // Get the estimate of MemStore rows. Add to total after logging
    // of individual sums below.
    long memStoreRows = estimateMemStoreRows(tblName, rowSize);

    if (logger.isDebugEnabled())
        logger.debug(tblName + " contains a total of " + totalEntries + " KeyValues in all HFiles.");
    if (logger.isDebugEnabled())
        logger.debug("Based on a sample, it is estimated that " + estimatedTotalPuts
                + " of these KeyValues are of type Put.");
    if (putKVsSampled + nullCount > 0)
        if (logger.isDebugEnabled())
            logger.debug("Sampling indicates a null incidence of "
                    + (nullCount * 100) / (putKVsSampled + nullCount) + " percent.");
    if (logger.isDebugEnabled())
        logger.debug("Estimated number of actual values (including nulls) is " + estimatedEntries);
    if (logger.isDebugEnabled())
        logger.debug("Estimated row count in HFiles = " + estimatedEntries + " / " + numCols + " (# columns) = "
                + rc[0]);
    if (logger.isDebugEnabled())
        logger.debug("Estimated row count from MemStores = " + memStoreRows);

    rc[0] += memStoreRows; // Add memstore estimate to total
    if (logger.isDebugEnabled())
        logger.debug("Total estimated row count for " + tblName + " = " + rc[0]);
    return true;
}

From source file:org.trafodion.sql.HBaseAccess.HBaseClient.java

License:Apache License

/**
This method returns the index levels and block size of an HBase table.
The index level is read from the HFile trailer block: one region is selected at random, all HFiles
in the chosen region are iterated over, and the maximum index level is taken.
The block size is read from the HColumnDescriptor.
**/
public boolean getHbaseTableInfo(String tblName, int[] tblInfo)
        throws MasterNotRunningException, IOException, ClassNotFoundException, URISyntaxException {

    if (logger.isDebugEnabled())
        logger.debug("HBaseClient.getHbaseTableInfo(" + tblName + ") called.");
    final String REGION_NAME_PATTERN = "[0-9a-f]*";
    final String HFILE_NAME_PATTERN = "[0-9a-f]*";

    // initialize 
    int indexLevel = 0;
    int currIndLevel = 0;
    int blockSize = 0;
    tblInfo[0] = indexLevel;
    tblInfo[1] = blockSize;

    // get block size
    HTable htbl = new HTable(config, tblName);
    HTableDescriptor htblDesc = htbl.getTableDescriptor();
    HColumnDescriptor[] families = htblDesc.getColumnFamilies();
    blockSize = families[0].getBlocksize();
    tblInfo[1] = blockSize;

    // Access the file system to go directly to the table's HFiles.
    long nano1 = 0, nano2 = 0;
    if (logger.isDebugEnabled())
        nano1 = System.nanoTime();
    FileSystem fileSystem = FileSystem.get(config);

    if (logger.isDebugEnabled()) {
        nano2 = System.nanoTime();
        logger.debug("FileSystem.get() took " + ((nano2 - nano1) + 500000) / 1000000 + " milliseconds.");
    }
    CacheConfig cacheConf = new CacheConfig(config);
    String hbaseRootPath = config.get(HConstants.HBASE_DIR).trim();
    if (hbaseRootPath.charAt(0) != '/')
        hbaseRootPath = new URI(hbaseRootPath).getPath();
    if (logger.isDebugEnabled())
        logger.debug("hbaseRootPath = " + hbaseRootPath);

    String regDir = hbaseRootPath + "/data/default/" + tblName + "/" + REGION_NAME_PATTERN + "/#1";
    if (logger.isDebugEnabled())
        logger.debug("region dir = " + regDir);

    // get a random region from the list of regions and look at all HFiles in that region
    FileStatus[] regArr;
    try {
        regArr = fileSystem.globStatus(new Path(regDir));
    } catch (IOException ioe) {
        if (logger.isDebugEnabled())
            logger.debug("fs.globStatus on region throws IOException");
        return false; // return index level = 0; and  block size
    }

    // logging
    if (logger.isDebugEnabled()) {
        for (int i = 0; i < regArr.length; i++)
            logger.debug("Region Path is " + regArr[i].getPath());
    }
    // get random region from the region array
    int regInd = 0;
    regInd = tblName.hashCode() % regArr.length;

    Path regName = regArr[regInd].getPath();
    // extract the MD5 hash name of the random region from its path, including the column family name.
    // we just need part2, which looks something like /c8fe2d575de62d5d5ffc530bda497bca/#1
    String strRegPath = regName.toString();
    String parts[] = strRegPath.split(tblName);
    String part2 = parts[1];

    // now remove the regular expression from the region path.
    // it would look something like /hbase/data/default/<cat.sch.tab>/[0-9a-f]*/#1
    int j = regDir.indexOf("/[");
    String regPrefix = regDir.substring(0, j);
    if (logger.isDebugEnabled())
        logger.debug("Region Path prefix = " + regPrefix);
    String hfilePath = regPrefix + part2 + "/" + HFILE_NAME_PATTERN;

    if (logger.isDebugEnabled())
        logger.debug("Random = " + regInd + ", region is " + regName);
    if (logger.isDebugEnabled())
        logger.debug("Hfile path = " + hfilePath);

    FileStatus[] fsArr;
    try {
        fsArr = fileSystem.globStatus(new Path(hfilePath));
    } catch (IOException ioe) {
        if (logger.isDebugEnabled())
            logger.debug("fs.globStatus on Hfile throws IOException");
        return false; // return index level = 0; and  block size
    }

    if (logger.isDebugEnabled()) {
        for (int i = 0; i < fsArr.length; i++)
            logger.debug("Hfile Path is " + fsArr[i].getPath());
    }

    // no HFiles; return from here
    if (fsArr.length == 0)
        return true; // return index level = 0; and  block size

    // get the maximum index level by going through all HFiles of the randomly chosen region
    if (logger.isDebugEnabled())
        nano1 = System.nanoTime();
    for (FileStatus fs : fsArr) {
        // Make sure the file name conforms to HFile name pattern.
        if (!StoreFileInfo.isHFile(fs.getPath())) {
            if (logger.isDebugEnabled())
                logger.debug("Skipped file " + fs.getPath() + " -- not a valid HFile name.");
            continue;
        }

        // Create a reader for the file to access the index levels stored
        // in the trailer block
        HFile.Reader reader = HFile.createReader(fileSystem, fs.getPath(), cacheConf, config);
        try {
            FixedFileTrailer trailer = reader.getTrailer();
            currIndLevel = trailer.getNumDataIndexLevels();
            // index levels also include data block, should be excluded.
            if (currIndLevel > 0)
                currIndLevel = currIndLevel - 1;
            if (logger.isDebugEnabled())
                logger.debug("currIndLevel = " + currIndLevel + ", indexLevel = " + indexLevel);
            if (currIndLevel > indexLevel)
                indexLevel = currIndLevel;
        } finally {
            reader.close(false);
        }
    } // for

    if (logger.isDebugEnabled()) {
        nano2 = System.nanoTime();
        logger.debug("get index level took " + ((nano2 - nano1) + 500000) / 1000000 + " milliseconds.");
    }

    tblInfo[0] = indexLevel;
    if (logger.isDebugEnabled()) {
        logger.debug("Index Levels for " + tblName + " = " + tblInfo[0]);
        logger.debug("Block Size for " + tblName + " = " + tblInfo[1]);
    }

    return true;
}

From source file:org.wso2.carbon.hdfs.mgt.HDFSAdmin.java

License:Open Source License

/**
 * Copy a given path to another path.
 * @param srcPath the src path to copy.
 * @param dstPath the destination to copy to.
 * @throws HDFSServerManagementException
 */
public void copy(String srcPath, String dstPath) throws HDFSServerManagementException {

    FileSystem hdfsFS = null;
    try {
        hdfsFS = hdfsAdminHelperInstance.getFSforUser();
    } catch (IOException e) {
        String msg = "Error occurred while mouting the file system";
        handleException(msg, e);
    }

    Path[] srcs = new Path[0];
    if (hdfsFS != null) {
        try {
            srcs = FileUtil.stat2Paths(hdfsFS.globStatus(new Path(srcPath)), new Path(srcPath));
        } catch (IOException e) {
            String msg = "Error occurred while trying to copy file.";
            handleException(msg, e);
        }
    }
    try {
        if (srcs.length > 1 && !hdfsFS.getFileStatus(new Path(dstPath)).isDir()) {
            throw new IOException("When copying multiple files, " + "destination should be a directory.");
        }
    } catch (IOException e) {
        String msg = "Error occurred while trying to copy file.";
        handleException(msg, e);
    }
    Configuration configuration = new Configuration();
    configuration.set("io.file.buffer.size", Integer.toString(4096));
    for (int i = 0; i < srcs.length; i++) {
        try {
            FileUtil.copy(hdfsFS, srcs[i], hdfsFS, new Path(dstPath), false, configuration);
        } catch (IOException e) {
            String msg = "Error occurred while trying to copy file.";
            handleException(msg, e);
        }
    }
}

From source file:parquet.hadoop.ParquetMultiOutputCommitter.java

License:Apache License

public void commitJob(JobContext jobContext) throws IOException {
    super.commitJob(jobContext);
    Configuration configuration = ContextUtil.getConfiguration(jobContext);
    if (configuration.getBoolean(ParquetMultiOutputFormat.ENABLE_JOB_SUMMARY, true)) {
        try {
            final FileSystem fileSystem = outputPath.getFileSystem(configuration);
            FileStatus[] statuses = fileSystem.globStatus(new Path(outputPath, new Path("*\\.parquet")));

            for (FileStatus outputStatus : statuses) {
                List<Footer> footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus);
                try {
                    ParquetFileWriter.writeMetadataFile(configuration, outputStatus.getPath(), footers);
                } catch (Exception e) {
                    LOG.warn("could not write summary file for " + outputStatus.getPath(), e);
                    final Path metadataPath = new Path(outputStatus.getPath(),
                            ParquetFileWriter.PARQUET_METADATA_FILE);
                    if (fileSystem.exists(metadataPath)) {
                        fileSystem.delete(metadataPath, true);
                    }
                }
            }
        } catch (Exception e) {
            LOG.warn("could not write summary file for " + outputPath, e);
        }
    }
}

From source file:ruciotools.Grep.java

License:Apache License

private static void assignInputFiles(FileSystem fs, Map<String, Object> settings, Job job)
        throws ParseException, IOException, Grep.NoInputFilesFound {
    // Expand the date range and types to derive the explicit set of input files
    List<Date> dates = new ArrayList<Date>();
    Calendar cal = Calendar.getInstance();
    Boolean excludeTmpFiles = (settings.get("excludeTmpFiles") != null);

    cal.setTime((Date) settings.get("fromDate"));
    while (!cal.getTime().after((Date) settings.get("toDate"))) {
        dates.add(cal.getTime());
        cal.add(Calendar.DATE, 1);
    }

    for (int i = 0; i < dates.size(); i++) {
        for (String type : (ArrayList<String>) settings.get("types")) {
            Path p = new Path("/user/rucio01/logs/" + type + "/*" + date_format.format(dates.get(i)) + "*");
            for (FileStatus file : fs.globStatus(p)) {
                if ((excludeTmpFiles) && (file.getPath().toString().endsWith("tmp"))) {
                    ((List<String>) settings.get("excludeTmpFiles")).add(file.getPath().getName().toString());
                    continue;
                }
                FileInputFormat.addInputPath(job, file.getPath());
            }
        }
    }
    if (FileInputFormat.getInputPaths(job).length == 0) {
        throw new Grep.NoInputFilesFound("For type " + settings.get("types") + " from "
                + date_format.format(dates.get(0)) + " to " + date_format.format(dates.get(dates.size() - 1))
                + " no log files coiuld be found on HDFS.");
    }
}

From source file:si.david.mapreduce.lda.InternalVectorDumper.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    /**
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     .withDescription("The directory containing Sequence File of Vectors")
     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        switch (dictionaryType) {
        case "text":
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
            break;
        case "sequencefile":
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
            break;
        default:
            //TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters == null || !(vector instanceof NamedVector)
                        || filters.contains(((NamedVector) vector).getName())) {
                    if (sizeOnly) {
                        if (vector instanceof NamedVector) {
                            writer.write(((NamedVector) vector).getName());
                            writer.write(":");
                        } else {
                            writer.write(String.valueOf(i++));
                            writer.write(":");
                        }
                        writer.write(String.valueOf(vector.size()));
                        writer.write('\n');
                    } else if (nameOnly) {
                        if (vector instanceof NamedVector) {
                            writer.write(((NamedVector) vector).getName());
                            writer.write('\n');
                        }
                    } else {
                        String fmtStr;
                        if (useCSV) {
                            fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                        } else {
                            fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                    sortVectors);
                        }
                        writer.write(fmtStr);
                        writer.write('\n');
                    }
                    itemCount++;
                }
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:simsql.code_generator.MyPhysicalDatabase.java

License:Apache License

public void backupTo(String toHere) {

    ConsoleReader cr = null;
    try {
        cr = new ConsoleReader();
    } catch (Exception e) {
        throw new RuntimeException("Could not create a console for reading!", e);
    }

    try {

        // first, we see if the directory that we are writing to exists
        Configuration conf = new Configuration();
        FileSystem dfs = FileSystem.get(conf);

        Path pathTo = new Path(toHere);
        if (dfs.exists(pathTo)) {
            System.out.format("The specified path already exists. Would you like to overwrite it? [Y/N] ");

            char answer = (char) cr.readCharacter(new char[] { 'Y', 'N', 'y', 'n' });
            System.out.println(answer);

            if (answer == 'Y' || answer == 'y') {
                dfs.delete(pathTo, true);
            } else {

                // otherwise, we don't proceed.
                pathTo = null;
            }
        }

        // do we continue?
        if (pathTo != null) {

            // make it a directory
            // dfs.mkdirs(pathTo);

            // get our current directory.
            Path pathFrom = new Path(myDir);

            // and all the paths of the in our directory
            Path[] sourcePaths = FileUtil.stat2Paths(dfs.globStatus(pathFrom), pathFrom);
            for (Path sp : sourcePaths) {

                // copy all of it.
                FileUtil.copy(dfs, sp, dfs, pathTo, false, conf);
            }
        }

    } catch (Exception e) {
        throw new RuntimeException("Could not back up data into directory " + toHere, e);
    }
}

From source file:simsql.code_generator.MyPhysicalDatabase.java

License:Apache License

public void restoreFrom(String fromHere) {

    try {

        // first, we see if the directory that we are reading from
        // exists and is a directory.
        Configuration conf = new Configuration();
        FileSystem dfs = FileSystem.get(conf);

        Path pathFrom = new Path(fromHere);
        if (!dfs.exists(pathFrom) || !dfs.isDirectory(pathFrom)) {
            System.out.println("The specified restoration path does not exist or is not a directory!");
            return;
        }

        // now, get the destination path.
        Path pathTo = new Path(myDir);
        if (dfs.exists(pathTo)) {

            // destroy it, if it's there.
            dfs.delete(pathTo, true);
        }

        // make the directory
        // dfs.mkdirs(pathTo);

        // and all the paths we will be copying
        Path[] sourcePaths = FileUtil.stat2Paths(dfs.globStatus(pathFrom), pathFrom);
        for (Path sp : sourcePaths) {

            // restore all of it.
            FileUtil.copy(dfs, sp, dfs, pathTo, false, conf);
        }

    } catch (Exception e) {
        throw new RuntimeException("Could not restore data from directory " + fromHere, e);
    }
}