marmot.tokenize.preprocess.WikiReader.java Source code

Java tutorial

Introduction

Here is the source code for marmot.tokenize.preprocess.WikiReader.java

Source

// Copyright 2014 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package marmot.tokenize.preprocess;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

import org.apache.commons.compress.compressors.bzip2.*;

/**
 * Iterates over pairs of lines read in lock-step from a tokenized and an
 * untokenized source.
 *
 * <p>
 * Blank lines (after trimming) are skipped on both sides and every
 * non-breaking space (U+00A0) is replaced by a regular space before a line is
 * used. When expansion is enabled, additional lines are appended to either
 * side of the current pair as long as doing so strictly lowers
 * {@code Pair.score}.
 *
 * <p>
 * NOTE(review): expansion relies on {@code mark()}/{@code reset()} of the
 * underlying {@link InternalReader}; the inline comments below state that
 * {@code BufferedReaderWrapper} (the type returned by {@link #openFile}) does
 * not support {@code reset()} - confirm before enabling expansion on readers
 * obtained from {@link #openFile}.
 */
public class WikiReader implements Iterator<Pair> {

    /** Look-ahead pair; {@code null} when nothing is buffered. */
    private Pair pair_;
    private final InternalReader untokenized_;
    private final InternalReader tokenized_;
    private final boolean expand_;

    /**
     * Creates a reader on top of two already opened line sources.
     *
     * @param untokenized
     *            source of untokenized lines
     * @param tokenized
     *            source of tokenized lines
     * @param expand
     *            whether {@link #expandPair()} is applied to every pair read
     */
    public WikiReader(InternalReader untokenized, InternalReader tokenized, boolean expand) {
        untokenized_ = untokenized;
        tokenized_ = tokenized;
        expand_ = expand;
    }

    /**
     * Creates a reader on top of two bzip2-compressed UTF-8 files.
     *
     * @param untokenized_file
     *            path of the untokenized file
     * @param tokenized_file
     *            path of the tokenized file
     * @param expand
     *            whether {@link #expandPair()} is applied to every pair read
     * @throws RuntimeException
     *             if one of the files cannot be opened (see {@link #openFile})
     */
    public WikiReader(String untokenized_file, String tokenized_file, boolean expand) {
        this(openFile(untokenized_file), openFile(tokenized_file), expand);
    }

    /**
     * Opens a bzip2-compressed file and exposes it as a UTF-8 line reader.
     *
     * @param file
     *            path of the compressed file
     * @return a reader over the decompressed content
     * @throws RuntimeException
     *             wrapping the {@link IOException} raised when the file is
     *             missing or cannot be read as a bzip2 stream
     */
    public static InternalReader openFile(String file) {
        FileInputStream raw = null;
        try {
            raw = new FileInputStream(file);
            return new BufferedReaderWrapper(new BufferedReader(
                    new InputStreamReader(new BZip2CompressorInputStream(raw), "UTF-8")));
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a single clause
            // covers both cases. Release the file handle before reporting.
            closeQuietly(raw);
            throw new RuntimeException(e);
        } catch (RuntimeException e) {
            closeQuietly(raw);
            throw e;
        }
    }

    /** Closes {@code stream} if it was opened, ignoring any failure to do so. */
    private static void closeQuietly(FileInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // The failure that led here is the one worth reporting.
        }
    }

    /**
     * Tries to buffer the next pair and reports whether one is available.
     *
     * @return {@code true} if {@link #next()} will return a pair
     * @throws RuntimeException
     *             if the freshly read pair fails the alignment check of
     *             {@link #readNext()}
     */
    @Override
    public boolean hasNext() {
        readNext();
        return pair_ != null;
    }

    /**
     * Normalizes a raw line by replacing every non-breaking space (U+00A0)
     * with a regular space.
     *
     * @param line
     *            the raw line, may be {@code null}
     * @return the normalized line, or {@code null} if {@code line} was
     *         {@code null}
     */
    protected String fixLine(String line) {
        if (line == null) {
            return line;
        }

        line = line.replace((char) 0xa0, ' ');
        return line;
    }

    /**
     * Reads lines from {@code reader} until one is non-empty after
     * normalization and trimming.
     *
     * @param reader
     *            the source to read from
     * @return the first non-empty line, normalized and trimmed
     * @throws NoSuchElementException
     *             if the reader returns {@code null} before such a line is
     *             found
     */
    protected String readNonEmptyLine(InternalReader reader) {
        String line;
        do {
            line = fixLine(reader.readLine());

            if (line == null) {
                throw new NoSuchElementException();
            }

            line = line.trim();
        } while (line.isEmpty());
        return line;
    }

    /**
     * Buffers the next pair unless one is already buffered.
     *
     * <p>
     * The tokenized line is read first, then the untokenized one. If either
     * source has no further non-empty line, nothing is buffered.
     *
     * @throws RuntimeException
     *             if the resulting pair has a score above 0.7 and a tokenized
     *             side longer than 20 characters
     */
    public void readNext() {
        if (pair_ != null) {
            return;
        }

        try {

            String tokenized = readNonEmptyLine(tokenized_);
            String untokenized = readNonEmptyLine(untokenized_);

            pair_ = new Pair(tokenized, untokenized);

            if (expand_) {
                expandPair();
            }

            if (pair_.score > 0.7 && pair_.tokenized.length() > 20) {
                throw new RuntimeException(String.format("Alignment error: %s --- %s : %g", pair_.tokenized,
                        pair_.untokenized, pair_.score));
            }

        } catch (NoSuchElementException ignored) {
            // One of the sources has no further non-empty line: leave pair_
            // as it is so that hasNext() reports the end of the iteration.
        }
    }

    /**
     * Greedily appends following lines to the buffered pair while that
     * strictly lowers its score.
     *
     * <p>
     * Each round first tries to append the next tokenized line, then the next
     * untokenized line. A candidate that does not lower the score is rejected
     * and the corresponding reader is reset to its mark. Rounds are repeated
     * until neither side could be extended. Running out of input on one side
     * only ends the expansion of that side.
     */
    protected void expandPair() {
        Pair pair;
        boolean expanded = false;

        // expand left (tokenized side):

        try {

            tokenized_.mark();
            pair = new Pair(pair_.tokenized + readNonEmptyLine(tokenized_), pair_.untokenized);

            if (pair.score < pair_.score) {
                pair_ = pair;
                expanded = true;
            } else {
                tokenized_.reset(); // not supported in BufferedReaderWrapper!
            }

        } catch (NoSuchElementException ignored) {
            // No further tokenized line: nothing to append on this side.
        }

        // expand right (untokenized side):

        try {

            untokenized_.mark();
            pair = new Pair(pair_.tokenized, pair_.untokenized + readNonEmptyLine(untokenized_));

            if (pair.score < pair_.score) {
                pair_ = pair;
                expanded = true;
            } else {
                untokenized_.reset(); // not supported in BufferedReaderWrapper!
            }

        } catch (NoSuchElementException ignored) {
            // No further untokenized line. Guarded like the tokenized side so
            // that the caller still runs its alignment check on the pair.
        }

        if (expanded) {
            expandPair();
        }
    }

    /**
     * Returns the buffered pair, reading one first if necessary.
     *
     * @return the next pair
     * @throws NoSuchElementException
     *             if no further pair is available
     */
    @Override
    public Pair next() {
        readNext();

        if (pair_ == null) {
            throw new NoSuchElementException();
        }

        Pair pair = pair_;
        pair_ = null;
        return pair;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Drains this iterator into a list.
     *
     * @return all remaining pairs in reading order
     */
    public List<Pair> readAll() {
        List<Pair> pairs = new LinkedList<Pair>();
        while (hasNext()) {
            pairs.add(next());
        }
        return pairs;
    }
}