List of usage examples for org.apache.hadoop.fs FileSystem getLength
@Deprecated public long getLength(Path f) throws IOException
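getLength(Path) is deprecated; the Hadoop javadoc directs callers to getFileStatus(Path) instead, whose getLen() returns the same value. A minimal sketch of both forms (the path name is an assumption for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetLengthExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path f = new Path("/tmp/example.dat"); // hypothetical path

        long deprecatedLen = fs.getLength(f);             // deprecated form
        long preferredLen = fs.getFileStatus(f).getLen(); // preferred replacement

        System.out.println(deprecatedLen + " == " + preferredLen);
    }
}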
From source file:org.commoncrawl.service.queryserver.query.DomainURLListQuery.java
License:Open Source License
@Override
public void getCachedResults(FileSystem fileSystem, Configuration conf, EventLoop eventLoop,
        final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        QueryRequest<DomainURLListQueryInfo, URLFPV2, CrawlDatumAndMetadata> theClientRequest,
        QueryCompletionCallback<DomainURLListQueryInfo, URLFPV2, CrawlDatumAndMetadata> callback)
        throws IOException {

    LOG.info("getCachedResults for Query:" + getQueryId() + " Retrieving Cached Results");

    FileSystem localFileSystem = FileSystem.getLocal(conf);

    Path outputFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest)
            + getURLOutputFileNameBasedOnSortByField(theClientRequest.getClientQueryInfo().getSortByField()));

    FSDataInputStream inputStream = localFileSystem.open(outputFileName);

    try {
        QueryResult<URLFPV2, CrawlDatumAndMetadata> resultOut = new QueryResult<URLFPV2, CrawlDatumAndMetadata>();

        //LOG.info("Calling ReadPaginationResults");
        readPaginatedResults(masterIndex, getQueryData().getDomainId(), inputStream,
                localFileSystem.getLength(outputFileName),
                theClientRequest.getClientQueryInfo().getSortByField(),
                theClientRequest.getClientQueryInfo().getSortOrder(),
                theClientRequest.getClientQueryInfo().getPaginationOffset(),
                theClientRequest.getClientQueryInfo().getPageSize(), resultOut);

        //LOG.info("Initiating getCachedResults Callback");
        callback.queryComplete(theClientRequest, resultOut);
    } finally {
        inputStream.close();
    }
}
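The example above uses getLength only to tell readPaginatedResults how far it may read into the cached results file. A minimal sketch of that pattern in isolation, sizing a full read from the reported length (class and path names are assumptions for illustration):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadCachedResults {
    public static byte[] readAll(Configuration conf, Path cacheFile) throws IOException {
        FileSystem localFs = FileSystem.getLocal(conf);
        // same value the deprecated getLength(cacheFile) would return
        long length = localFs.getFileStatus(cacheFile).getLen();
        FSDataInputStream in = localFs.open(cacheFile);
        try {
            byte[] data = new byte[(int) length]; // assumes the cached file fits in memory
            in.readFully(0, data);                // positioned read of the whole file
            return data;
        } finally {
            in.close();
        }
    }
}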
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

    File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
    tempFile.mkdir();

    try {
        // create the final output spill writer ...
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
                outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
                true);
        try {
            MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
                    conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
                    new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);
            try {
                for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
                    // 0. shard domain id to find index file location ...
                    int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE)
                            % CrawlEnvironment.NUM_DB_SHARDS);
                    // build path to index file
                    Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                            + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
                    LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId
                            + " Index Path:" + indexFilePath);
                    // 1. scan domainFP to index file first
                    // 2. given index, scan index->pos file to find scan start position
                    // 3. given scan start position, scan forward until fp match is found.
                    // 4. collect all matching entries and output to a file ?
                    FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
                    try {
                        TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                                fs.getLength(indexFilePath), conf);
                        try {
                            TFile.Reader.Scanner scanner = reader.createScanner();
                            try {
                                // generate key ...
                                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                                keyBuffer.writeLong(targetRootDomainFP);
                                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                                    // setup for value scan
                                    DataInputStream valueStream = scanner.entry().getValueStream();
                                    int dataOffsetOut = -1;
                                    while (valueStream.available() > 0) {
                                        // read entries looking for our specific entry
                                        int shardIdx = valueStream.readInt();
                                        int dataOffset = valueStream.readInt();
                                        if (shardIdx == targetShardId) {
                                            dataOffsetOut = dataOffset;
                                            break;
                                        }
                                    }
                                    LOG.info("Index Search Yielded:" + dataOffsetOut);
                                    if (dataOffsetOut != -1) {
                                        // ok create a data path
                                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                                                + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
                                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId) + ".index");
                                        // check to see if index is already loaded ...
                                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                                        synchronized (_shardToIndexMap) {
                                            index = _shardToIndexMap.get(targetShardId);
                                        }
                                        if (index == null) {
                                            LOG.info("Loading Index from Path:" + finalDataIndexPath);
                                            // load index
                                            index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs,
                                                    finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                                            // put in cache
                                            synchronized (_shardToIndexMap) {
                                                _shardToIndexMap.put(targetShardId, index);
                                            }
                                        }
                                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                                        // ok time to create a reader
                                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,
                                                finalDataPath, conf);
                                        try {
                                            LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                                            index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);
                                            FlexBuffer keyBytes = new FlexBuffer();
                                            URLFPV2 sourceFP = new URLFPV2();
                                            DataInputBuffer keyReader = new DataInputBuffer();
                                            TextBytes urlTxt = new TextBytes();
                                            // ok ready to go ...
                                            while (dataReader.next(keyBytes, sourceFP)) {
                                                // initialize reader
                                                keyReader.reset(keyBytes.get(), keyBytes.getOffset(),
                                                        keyBytes.getCount());
                                                long targetFP = keyReader.readLong();
                                                if (targetRootDomainFP == targetFP) {
                                                    finalMerger.spillRecord(keyBytes, sourceFP);
                                                } else {
                                                    LOG.info("FP:" + targetFP + " > TargetFP:"
                                                            + targetRootDomainFP + " Exiting Iteration Loop");
                                                    break;
                                                }
                                            }
                                        } finally {
                                            LOG.info("Closing Reader");
                                            dataReader.close();
                                        }
                                    }
                                }
                            } finally {
                                LOG.info("Closing Scanner");
                                scanner.close();
                            }
                        } finally {
                            LOG.info("Closing TFile Reader");
                            reader.close();
                        }
                    } finally {
                        LOG.info("Closing InputStream");
                        indexDataInputStream.close();
                    }
                }
            } finally {
                finalMerger.close();
            }
        } finally {
            spillwriter.close();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
    }
}
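TFile.Reader needs the byte length of the underlying stream up front, which is why the example passes fs.getLength(indexFilePath) alongside the opened stream. A minimal sketch of that construction on its own, using the non-deprecated getFileStatus().getLen() (class and method names are assumptions for illustration):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.file.tfile.TFile;

public class TFileLengthExample {
    public static void openAndClose(FileSystem fs, Configuration conf, Path indexFilePath) throws IOException {
        FSDataInputStream in = fs.open(indexFilePath);
        try {
            // the reader cannot discover the stream length itself, so the caller supplies it
            TFile.Reader reader = new TFile.Reader(in, fs.getFileStatus(indexFilePath).getLen(), conf);
            try {
                TFile.Reader.Scanner scanner = reader.createScanner();
                scanner.close();
            } finally {
                reader.close();
            }
        } finally {
            in.close();
        }
    }
}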
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
@Override
public void getCachedResults(FileSystem fileSystem, Configuration conf, EventLoop eventLoop,
        final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        QueryRequest<URLLinkDetailQueryInfo, Writable, Writable> theClientRequest,
        QueryCompletionCallback<URLLinkDetailQueryInfo, Writable, Writable> callback) throws IOException {

    LOG.info("getCachedResults for Query:" + getQueryId() + " Retrieving Cached Results");

    FileSystem localFileSystem = FileSystem.getLocal(conf);

    Path cacheDataFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + "DATA");
    Path cacheDataIndexFileName = new Path(getLocalQueryResultsPathPrefix(theClientRequest) + "DATA.index");

    QueryResult<Writable, Writable> resultOut = new QueryResult<Writable, Writable>();

    if (getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.LINKS_QUERY
            || getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.INVERSE_QUERY) {
        FSDataInputStream inputStream = localFileSystem.open(cacheDataFileName);
        try {
            //LOG.info("Calling ReadPaginationResults");
            readPaginatedResults(masterIndex, inputStream, localFileSystem.getLength(cacheDataFileName),
                    theClientRequest.getClientQueryInfo().getSortOrder(),
                    theClientRequest.getClientQueryInfo().getPaginationOffset(),
                    theClientRequest.getClientQueryInfo().getPageSize(), resultOut);
        } finally {
            inputStream.close();
        }
    } else if (getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_QUERY) {
        readPaginatedInlinkingDomainInfo(masterIndex, localFileSystem, cacheDataIndexFileName, cacheDataFileName,
                theClientRequest.getClientQueryInfo().getSortOrder(),
                theClientRequest.getClientQueryInfo().getPaginationOffset(),
                theClientRequest.getClientQueryInfo().getPageSize(), resultOut);
    } else if (getQueryData().getQueryType() == URLLinkDetailQueryInfo.QueryType.INVERSE_BY_DOMAIN_DETAIL_QUERY) {
        InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
        domainInfo.setUrlCount(getQueryData().getInlinkDomainURLCount());
        domainInfo.setUrlDataPos(getQueryData().getUrlDataOffset());
        readPaginatedInlinkingDomainDetail(masterIndex, localFileSystem, cacheDataFileName, domainInfo,
                theClientRequest.getClientQueryInfo().getSortOrder(),
                theClientRequest.getClientQueryInfo().getPaginationOffset(),
                theClientRequest.getClientQueryInfo().getPageSize(), resultOut);
    }

    //LOG.info("Initiating getCachedResults Callback");
    callback.queryComplete(theClientRequest, resultOut);
}
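Each branch above assumes its cached DATA / DATA.index files are already present before opening them. A minimal sketch of a guard for that assumption, where FileStatus.getLen() stands in for the deprecated getLength() (the helper class and method are hypothetical):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CacheFileGuard {
    // hypothetical helper: returns the file's length, or -1 if it is absent or empty
    public static long usableLength(FileSystem fs, Path cacheFile) throws IOException {
        if (!fs.exists(cacheFile)) {
            return -1;
        }
        FileStatus status = fs.getFileStatus(cacheFile);
        return status.getLen() > 0 ? status.getLen() : -1;
    }
}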