List of usage examples for org.apache.hadoop.fs FileSystem getLength
@Deprecated public long getLength(Path f) throws IOException
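getLength(Path) is deprecated; the Hadoop javadoc directs callers to getFileStatus(Path) instead, whose getLen() returns the same value. A minimal sketch of both forms (the path name is an assumption for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetLengthExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path f = new Path("/tmp/example.dat"); // hypothetical path

        long deprecatedLen = fs.getLength(f);             // deprecated form
        long preferredLen = fs.getFileStatus(f).getLen(); // preferred replacement

        System.out.println(deprecatedLen + " == " + preferredLen);
    }
}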
From source file:org.commoncrawl.service.queryserver.query.DomainURLListQuery.java
License:Open Source License
@Override
public void getCachedResults(FileSystem fileSystem, Configuration conf, EventLoop eventLoop,
        final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        QueryRequest<DomainURLListQueryInfo, URLFPV2, CrawlDatumAndMetadata> theClientRequest,
        QueryCompletionCallback<DomainURLListQueryInfo, URLFPV2, CrawlDatumAndMetadata> callback)
        throws IOException {

    LOG.info("getCachedResults for Query:" + getQueryId() + " Retrieving Cached Results");

    FileSystem localFileSystem = FileSystem.getLocal(conf);

    Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)
            + getURLOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));

    FSDataInputStream inputStream = localFileSystem.open(outputFileName);

    try {
        QueryResult<URLFPV2, CrawlDatumAndMetadata> resultOut = new QueryResult<URLFPV2, CrawlDatumAndMetadata>();

        //LOG.info("Calling ReadPaginationResults");
        readPaginatedResults(masterIndex, getQueryData().getDomainId(), inputStream,
                localFileSystem.getLength(outputFileName),
                theClientRequest.getClientQueryInfo().getSortByField(),
                theClientRequest.getClientQueryInfo().getSortOrder(),
                theClientRequest.getClientQueryInfo().getPaginationOffset(),
                theClientRequest.getClientQueryInfo().getPageSize(), resultOut);

        //LOG.info("Initiating getCachedResults Callback");
        callback.queryComplete(theClientRequest, resultOut);
    } finally {
        inputStream.close();
    }
}
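The example above uses getLength only to tell readPaginatedResults how far it may read into the cached results file. A minimal sketch of that pattern in isolation, sizing a full read from the reported length (class and path names are assumptions for illustration):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadCachedResults {
    public static byte[] readAll(Configuration conf, Path cacheFile) throws IOException {
        FileSystem localFs = FileSystem.getLocal(conf);
        // same value the deprecated getLength(cacheFile) would return
        long length = localFs.getFileStatus(cacheFile).getLen();
        FSDataInputStream in = localFs.open(cacheFile);
        try {
            byte[] data = new byte[(int) length]; // assumes the cached file fits in memory
            in.readFully(0, data);                // positioned read of the whole file
            return data;
        } finally {
            in.close();
        }
    }
}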
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

    File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
    tempFile.mkdir();

    try {
        // create the final output spill writer ...
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
                outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
                true);
        try {
            MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
                    conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
                    new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);
            try {
                for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
                    // 0. shard domain id to find index file location ...
                    int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE)
                            % CrawlEnvironment.NUM_DB_SHARDS);
                    // build path to index file
                    Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                            + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
                    LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId
                            + " Index Path:" + indexFilePath);
                    // 1. scan domainFP to index file first
                    // 2. given index, scan index->pos file to find scan start position
                    // 3. given scan start position, scan forward until fp match is found.
                    // 4. collect all matching entries and output to a file ?
                    FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
                    try {
                        TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                                fs.getLength(indexFilePath), conf);
                        try {
                            TFile.Reader.Scanner scanner = reader.createScanner();
                            try {
                                // generate key ...
                                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                                keyBuffer.writeLong(targetRootDomainFP);
                                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                                    // setup for value scan
                                    DataInputStream valueStream = scanner.entry().getValueStream();
                                    int dataOffsetOut = -1;
                                    while (valueStream.available() > 0) {
                                        // read entries looking for our specific entry
                                        int shardIdx = valueStream.readInt();
                                        int dataOffset = valueStream.readInt();
                                        if (shardIdx == targetShardId) {
                                            dataOffsetOut = dataOffset;
                                            break;
                                        }
                                    }
                                    LOG.info("Index Search Yielded:" + dataOffsetOut);
                                    if (dataOffsetOut != -1) {
                                        // ok create a data path
                                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                                                + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
                                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId) + ".index");
                                        // check to see if index is already loaded ...
                                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                                        synchronized (_shardToIndexMap) {
                                            index = _shardToIndexMap.get(targetShardId);
                                        }
                                        if (index == null) {
                                            LOG.info("Loading Index from Path:" + finalDataIndexPath);
                                            // load index
                                            index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs,
                                                    finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                                            // put in cache
                                            synchronized (_shardToIndexMap) {
                                                _shardToIndexMap.put(targetShardId, index);
                                            }
                                        }
                                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                                        // ok time to create a reader
                                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,
                                                finalDataPath, conf);
                                        try {
                                            LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                                            index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);
                                            FlexBuffer keyBytes = new FlexBuffer();
                                            URLFPV2 sourceFP = new URLFPV2();
                                            DataInputBuffer keyReader = new DataInputBuffer();
                                            TextBytes urlTxt = new TextBytes();
                                            // ok ready to go ...
                                            while (dataReader.next(keyBytes, sourceFP)) {
                                                // initialize reader
                                                keyReader.reset(keyBytes.get(), keyBytes.getOffset(),
                                                        keyBytes.getCount());
                                                long targetFP = keyReader.readLong();
                                                if (targetRootDomainFP == targetFP) {
                                                    finalMerger.spillRecord(keyBytes, sourceFP);
                                                } else {
                                                    LOG.info("FP:" + targetFP + " > TargetFP:"
                                                            + targetRootDomainFP + " Exiting Iteration Loop");
                                                    break;
                                                }
                                            }
                                        } finally {
                                            LOG.info("Closing Reader");
                                            dataReader.close();
                                        }
                                    }
                                }
                            } finally {
                                LOG.info("Closing Scanner");
                                scanner.close();
                            }
                        } finally {
                            LOG.info("Closing TFile Reader");
                            reader.close();
                        }
                    } finally {
                        LOG.info("Closing InputStream");
                        indexDataInputStream.close();
                    }
                }
            } finally {
                finalMerger.close();
            }
        } finally {
            spillwriter.close();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
    }
}
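TFile.Reader needs the byte length of the underlying stream up front, which is why the example passes fs.getLength(indexFilePath) alongside the opened stream. A minimal sketch of that construction on its own, using the non-deprecated getFileStatus().getLen() (class and method names are assumptions for illustration):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.file.tfile.TFile;

public class TFileLengthExample {
    public static void openAndClose(FileSystem fs, Configuration conf, Path indexFilePath) throws IOException {
        FSDataInputStream in = fs.open(indexFilePath);
        try {
            // the reader cannot discover the stream length itself, so the caller supplies it
            TFile.Reader reader = new TFile.Reader(in, fs.getFileStatus(indexFilePath).getLen(), conf);
            try {
                TFile.Reader.Scanner scanner = reader.createScanner();
                scanner.close();
            } finally {
                reader.close();
            }
        } finally {
            in.close();
        }
    }
}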
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
@Override
public void getCachedResults(FileSystem fileSystem, Configuration conf, EventLoop eventLoop,
        final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        QueryRequest<URLLinkDetailQueryInfo, Writable, Writable> theClientRequest,
        QueryCompletionCallback<URLLinkDetailQueryInfo, Writable, Writable> callback) throws IOException {

    LOG.info("getCachedResults for Query:" + getQueryId() + " Retrieving Cached Results");

    FileSystem localFileSystem = FileSystem.getLocal(conf);

    Path cacheDataFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + "DATA");
    Path cacheDataIndexFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + "DATA.index");

    QueryResult<Writable, Writable> resultOut = new QueryResult<Writable, Writable>();

    if (getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.LINKS_QUERY
            || getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.INVERSE_QUERY) {
        FSDataInputStream inputStream = localFileSystem.open(cacheDataFileName);
        try {
            //LOG.info("Calling ReadPaginationResults");
            readPaginatedResults(masterIndex, inputStream, localFileSystem.getLength(cacheDataFileName),
                    theClientRequest.getClientQueryInfo().getSortOrder(),
                    theClientRequest.getClientQueryInfo().getPaginationOffset(),
                    theClientRequest.getClientQueryInfo().getPageSize(), resultOut);
        } finally {
            inputStream.close();
        }
    } else if (getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_QUERY) {
        readPaginatedInlinkingDomainInfo(masterIndex, localFileSystem, cacheDataIndexFileName, cacheDataFileName,
                theClientRequest.getClientQueryInfo().getSortOrder(),
                theClientRequest.getClientQueryInfo().getPaginationOffset(),
                theClientRequest.getClientQueryInfo().getPageSize(), resultOut);
    } else if (getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY) {
        InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
        domainInfo.setUrlCount(getQueryData().getInlinkDomainURLCount());
        domainInfo.setUrlDataPos(getQueryData().getUrlDataOffset());
        readPaginatedInlinkingDomainDetail(masterIndex, localFileSystem, cacheDataFileName, domainInfo,
                theClientRequest.getClientQueryInfo().getSortOrder(),
                theClientRequest.getClientQueryInfo().getPaginationOffset(),
                theClientRequest.getClientQueryInfo().getPageSize(), resultOut);
    }

    //LOG.info("Initiating getCachedResults Callback");
    callback.queryComplete(theClientRequest, resultOut);
}
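Each branch above assumes its cached DATA / DATA.index files are already present before opening them. A minimal sketch of a guard for that assumption, where FileStatus.getLen() stands in for the deprecated getLength() (the helper class and method are hypothetical):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CacheFileGuard {
    // hypothetical helper: returns the file's length, or -1 if it is absent or empty
    public static long usableLength(FileSystem fs, Path cacheFile) throws IOException {
        if (!fs.exists(cacheFile)) {
            return -1;
        }
        FileStatus status = fs.getFileStatus(cacheFile);
        return status.getLen() > 0 ? status.getLen() : -1;
    }
}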