io.anserini.collection.TrecCollection.java Source code

Introduction

Here is the source code for io.anserini.collection.TrecCollection.java
Source

/**
 * Anserini: An information retrieval toolkit built on Lucene
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.collection;

import io.anserini.document.TrecDocument;
import org.apache.commons.compress.compressors.z.ZCompressorInputStream;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPInputStream;

/**
 * Class representing an instance of a TREC collection.
 *
 * <p>A collection is split into file segments. Each {@link FileSegment} reads one file
 * (plain text, gzip-compressed, or Unix {@code compress}-style {@code .z} files) and
 * hands out the documents it contains one at a time.
 *
 * @param <D> the concrete document type returned by the segments of this collection
 */
public class TrecCollection<D extends TrecDocument> extends Collection {

    /**
     * A single file of the collection, read sequentially through a {@link BufferedReader}.
     */
    public class FileSegment extends Collection.FileSegment {
        /** Reader over the (decompressed) file content; {@code null} until a file is opened. */
        protected BufferedReader bufferedReader;
        /** Buffer size handed to {@link GZIPInputStream} for {@code .gz} files. */
        protected final int BUFFER_SIZE = 1 << 16; // 64K

        /**
         * Creates a segment that is not bound to any file; {@link #bufferedReader} stays
         * {@code null}.
         */
        protected FileSegment() {
        }

        /**
         * Opens the given file and selects the decompression scheme from its name.
         *
         * @param path file to read
         * @throws IOException if the file cannot be opened or its compression header cannot be read
         */
        protected FileSegment(Path path) throws IOException {
            this.path = path;
            this.bufferedReader = openReader(path);
        }

        /**
         * Builds a UTF-8 reader over {@code path}, decompressing according to the file name.
         * All three formats are decoded as UTF-8 so the result does not depend on the
         * platform default charset.
         */
        private BufferedReader openReader(Path path) throws IOException {
            String fileName = path.toString();
            InputStream raw = Files.newInputStream(path, StandardOpenOption.READ);
            try {
                InputStream decoded;
                if (fileName.matches(".*?\\.\\d*z$")) { // .z .0z .1z .2z
                    decoded = new ZCompressorInputStream(new BufferedInputStream(raw));
                } else if (fileName.endsWith(".gz")) { // .gz
                    decoded = new GZIPInputStream(raw, BUFFER_SIZE);
                } else { // plain text file
                    decoded = raw;
                }
                return new BufferedReader(new InputStreamReader(decoded, StandardCharsets.UTF_8));
            } catch (IOException | RuntimeException e) {
                // The decompressor constructors read the stream header and may fail; without
                // this the already-opened file handle would leak.
                try {
                    raw.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        }

        /**
         * Closes the underlying reader, if one was opened.
         *
         * @throws IOException if closing the reader fails
         */
        @Override
        public void close() throws IOException {
            // NOTE(review): resetting atEOF to false means hasNext() reports true again after
            // close(); kept as-is because code outside this file may rely on it -- confirm.
            atEOF = false;
            if (bufferedReader != null) {
                bufferedReader.close();
            }
        }

        /**
         * @return {@code true} until {@link #next()} has hit the end of the file or a read error
         */
        @Override
        public boolean hasNext() {
            return !atEOF;
        }

        /**
         * Reads the next document from the file.
         *
         * @return the next document, or {@code null} when the end of the file is reached or
         *         the file cannot be read any further
         */
        @Override
        @SuppressWarnings("unchecked") // the segment only ever produces TrecDocument instances
        public D next() {
            TrecDocument doc;
            try {
                doc = (TrecDocument) new TrecDocument().readNextRecord(bufferedReader);
            } catch (IOException e) {
                // A failing reader (e.g. a truncated compressed stream) keeps failing; mark
                // the segment as exhausted so a hasNext()/next() loop terminates instead of
                // spinning on null results forever.
                atEOF = true;
                return null;
            }
            if (doc == null) {
                atEOF = true;
            }
            return (D) doc;
        }
    }

    /**
     * Discovers the files that make up this collection by delegating to {@code discover}
     * on the collection's {@code path}.
     *
     * <p>Passes {@code "readme"} as the skipped file prefix and {@code "cr"}, {@code "dtd"},
     * {@code "dtds"} as the skipped directories; the three remaining filters are empty.
     *
     * @return the paths returned by {@code discover}
     */
    @Override
    public List<Path> getFileSegmentPaths() {
        Set<String> skippedFilePrefix = new HashSet<>(Arrays.asList("readme"));
        Set<String> skippedDirs = new HashSet<>(Arrays.asList("cr", "dtd", "dtds"));

        return discover(path, skippedFilePrefix, EMPTY_SET, EMPTY_SET, EMPTY_SET, skippedDirs);
    }

    /**
     * Creates a {@link FileSegment} that reads the given file.
     *
     * @param p file to open
     * @return a segment positioned at the start of the file
     * @throws IOException if the file cannot be opened
     */
    @Override
    public Collection.FileSegment createFileSegment(Path p) throws IOException {
        return new FileSegment(p);
    }
}