de.julielab.jcore.reader.file.main.FileReader.java Source code

Java tutorial

Introduction

Here is the source code for de.julielab.jcore.reader.file.main.FileReader.java

Source

/**
 * FileReader.java
 *
 * Copyright (c) 2015, JULIE Lab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License (LGPL) v3.0
 *
 * Author: muehlhausen
 *
 * Current version: 1.0
 * Since version:   1.0
 *
 * Creation date: 27.08.2007
 *
 * A UIMA <code>CollectionReader</code> that reads in simple text from a file. Derived from the Apache UIMA example <code>FileSystemCollectionReader</code>.
 **/

package de.julielab.jcore.reader.file.main;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.julielab.jcore.types.Date;
import de.julielab.jcore.types.pubmed.Header;

/**
 * UIMA collection reader that delivers the content of every regular file
 * located directly inside a configured input directory as one document. File
 * contents are decoded as UTF-8. Optionally a {@link Header} carrying a
 * document id derived from the file name is created, and a publication date
 * looked up in a separate dates file is added to the CAS.
 */
public class FileReader extends CollectionReader_ImplBase {

    /** Parameter name: path of the directory whose files are read (mandatory). */
    public static final String DIRECTORY_INPUT = "InputDirectory";
    /** Parameter name: whether a {@link Header} with a file name based document id is created. */
    public static final String FILENAME_AS_DOC_ID = "UseFilenameAsDocId";
    /** Parameter name: path of the optional file mapping document ids to publication dates. */
    public static final String PUBLICATION_DATES_FILE = "PublicationDatesFile";
    /** Parameter name: optional list of file extensions (without dot) that are accepted. */
    public static final String ALLOWED_FILE_EXTENSIONS = "AllowedFileExtensions";

    /** The files to deliver, collected once in {@link #initialize()}. */
    private ArrayList<File> files;

    /** Index into {@link #files} of the next file to deliver. */
    private int fileIndex;

    private boolean useFilenameAsDocId;
    /** Optional dates file; {@code null} when the parameter is not set. */
    private File publicationDatesFile;

    /**
     * Reads the configuration parameters and collects all regular files of the
     * input directory that pass the extension filter. Sub-directories are not
     * descended into.
     *
     * @throws ResourceInitializationException
     *             if the input directory parameter is missing, does not denote
     *             an existing directory, or the directory cannot be listed
     * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
     */
    @Override
    public void initialize() throws ResourceInitializationException {

        // Fail with a descriptive UIMA exception instead of a bare
        // NullPointerException when the mandatory parameter is missing.
        String inputDirectoryPath = (String) getConfigParameterValue(DIRECTORY_INPUT);
        if (inputDirectoryPath == null) {
            throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT,
                    new Object[] { DIRECTORY_INPUT });
        }
        File inputDirectory = new File(inputDirectoryPath.trim());

        String publicationDatesPath = (String) getConfigParameterValue(PUBLICATION_DATES_FILE);
        if (publicationDatesPath != null) {
            publicationDatesFile = new File(publicationDatesPath.trim());
        }

        // An unset parameter counts as false.
        useFilenameAsDocId = Boolean.TRUE.equals(getConfigParameterValue(FILENAME_AS_DOC_ID));

        String[] allowedExtensionsArray = (String[]) getConfigParameterValue(ALLOWED_FILE_EXTENSIONS);
        final Set<String> allowedExtensions = new HashSet<>();
        if (null != allowedExtensionsArray) {
            for (String allowedExtension : allowedExtensionsArray) {
                allowedExtensions.add(allowedExtension);
            }
        }
        fileIndex = 0;

        if (!inputDirectory.exists() || !inputDirectory.isDirectory()) {
            throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
                    new Object[] { DIRECTORY_INPUT, this.getMetaData().getName(), inputDirectory.getPath() });
        }

        File[] candidates = inputDirectory.listFiles(new FilenameFilter() {

            @Override
            public boolean accept(File dir, String name) {
                // No configured extensions means every file is accepted.
                if (allowedExtensions.isEmpty()) {
                    return true;
                }
                // For a name without a dot, lastIndexOf yields -1 and the
                // whole name is compared against the allowed extensions.
                String extension = name.substring(name.lastIndexOf('.') + 1);
                return allowedExtensions.contains(extension);
            }
        });
        // listFiles returns null (not an empty array) when an I/O error occurs.
        if (candidates == null) {
            throw new ResourceInitializationException(
                    new IOException("Could not list the contents of directory " + inputDirectory.getPath()));
        }

        files = new ArrayList<File>();
        for (File candidate : candidates) {
            if (!candidate.isDirectory()) {
                files.add(candidate);
            }
        }
    }

    /**
     * @return {@code true} while there are files that have not been delivered yet
     * @see org.apache.uima.collection.CollectionReader#hasNext()
     */
    @Override
    public boolean hasNext() {
        return fileIndex < files.size();
    }

    /**
     * Puts the UTF-8 decoded content of the next file into the CAS as document
     * text. If file names are used as document ids, a {@link Header} is added
     * whose id is the file name with the part from its last dot on removed and,
     * after that, the part from its last underscore on removed.
     *
     * @param aCAS
     *            the CAS to populate
     * @throws IOException
     *             if the file cannot be read
     * @throws CollectionException
     *             if the JCas view of the CAS cannot be obtained
     * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        JCas jcas;
        try {
            jcas = aCAS.getJCas();
        } catch (CASException e) {
            throw new CollectionException(e);
        }

        File file = files.get(fileIndex++);

        String text = FileUtils.readFileToString(file, "UTF-8");
        jcas.setDocumentText(text);

        if (useFilenameAsDocId) {
            String filename = file.getName();
            // "> 0" keeps names that start with the separator untouched.
            int extDotIndex = filename.lastIndexOf('.');
            if (extDotIndex > 0) {
                filename = filename.substring(0, extDotIndex);
            }
            int extUnderScoreIndex = filename.lastIndexOf('_');
            if (extUnderScoreIndex > 0) {
                filename = filename.substring(0, extUnderScoreIndex);
            }

            Header header = new Header(jcas);
            header.setDocId(filename);

            addDateForID(header, jcas, filename);

            header.addToIndexes();
        }
    }

    /**
     * If the publication dates file is available, searches it for the given
     * document id and adds a {@link Date} annotation with the publication year
     * (and month, if parseable and non-zero) to the CAS indexes. Only the first
     * line consisting of exactly two whitespace separated tokens, the id and a
     * seven character date such as {@code 2002-01}, is considered.
     * <p>
     * The lookup is best effort: problems reading the dates file are reported
     * on standard error and do not abort document processing.
     *
     * @param header
     *            the header of the current document; currently not modified
     *            because the created date is not linked to it (see TODO below)
     * @param jCas
     *            the CAS the date annotation is created in
     * @param id
     *            the document id to look up
     */
    private void addDateForID(Header header, JCas jCas, String id) {
        if (publicationDatesFile == null || !publicationDatesFile.exists() || !publicationDatesFile.isFile()) {
            return;
        }
        // try-with-resources: the reader was previously never closed, which
        // leaked one file handle per processed document.
        try (BufferedReader br = new BufferedReader(new java.io.FileReader(publicationDatesFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                // split line at whitespace
                String[] tokens = line.split("\\s+");
                if (tokens.length != 2 || !tokens[0].equals(id) || tokens[1].length() != 7) {
                    continue;
                }
                // extract year and month from token of format 2002-01
                int year = 0;
                int month = 0;
                try {
                    year = Integer.parseInt(tokens[1].substring(0, 4));
                    month = Integer.parseInt(tokens[1].substring(5));
                } catch (NumberFormatException ignored) {
                    // Deliberately ignored: whatever could not be parsed stays
                    // 0 and is treated as "unknown" below.
                }
                Date pubDate = new Date(jCas);
                if (month != 0) {
                    pubDate.setMonth(month);
                }
                if (year != 0) {
                    pubDate.setYear(year);
                    pubDate.addToIndexes();
                    // TODO: the date is not attached to the header yet;
                    // header.setDate(pubDate) was reported not to work.
                }
                // Only the first matching line is used.
                return;
            }
        } catch (IOException e) {
            // Covers FileNotFoundException as well. A missing or unreadable
            // dates file must not stop the collection from being read.
            e.printStackTrace();
        }
    }

    /**
     * Nothing to release: no resources are held open between calls.
     *
     * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
     */
    @Override
    public void close() throws IOException {
    }

    /**
     * @return a single progress entry counting delivered files against the
     *         total number of collected files
     * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(fileIndex, files.size(), Progress.ENTITIES) };
    }
}