org.ala.repository.RepositoryImpl.java Source code

Introduction

Here is the source code for org.ala.repository.RepositoryImpl.java
Source

/***************************************************************************
 * Copyright (C) 2009 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.repository;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import javax.inject.Inject;

import org.ala.dao.DocumentDAO;
import org.ala.dao.InfoSourceDAO;
import org.ala.documentmapper.MappingUtils;
import org.ala.model.Document;
import org.ala.util.FileType;
import org.ala.util.GenerateThumbnails;
import org.ala.util.MimeType;
import org.ala.util.RepositoryFileUtils;
import org.ala.util.TurtleUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.ObjectMapper;
import org.openrdf.model.impl.BNodeImpl;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.model.impl.StatementImpl;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFWriter;
import org.openrdf.rio.turtle.TurtleWriter;
import org.springframework.stereotype.Component;

/**
 * A simple implementation of a repository that knows how to store
 * documents on the file system and maintain any housekeeping
 * information. 
 * 
 * @author Dave Martin (David.Martin@csiro.au)
 */
@Component("repository") // bean id
public class RepositoryImpl implements Repository {

    protected Logger log = Logger.getLogger(RepositoryImpl.class);

    /** The directory root */
    protected String cacheDirectoryRoot = "/data/bie"; // look for override in properties file

    protected String stagingRoot = "/data/bie-staging";

    protected String baseDirectory;
    /** Max file limit. Typically should be 32,000 on linux */
    public static int MAX_FILES_PER_DIRECTORY = 10000;
    /** Document DAO to be injected */
    @Inject
    protected DocumentDAO documentDao;
    @Inject
    protected InfoSourceDAO infoSourceDAO;
    @Inject
    protected RepositoryFileUtils repoFileUtils;

    //   protected boolean useTurtle = false;

    /**
     * Initialize the file cache. This class has some state to reduce
     * counting the number of files in the current directory.
     * 
     * @throws IOException
     */
    public RepositoryImpl() throws IOException {
        this(null);
    }

    /**
    * Initialise repository with subroot.
    * 
    * @param directorySubRoot
    * @throws IOException
    */
    public RepositoryImpl(String directorySubRoot) throws IOException {
        this(null, directorySubRoot);
    }

    /**
    * Initialise repository with root and subroot.
    * 
    * @param directorySubRoot
    * @throws IOException
    */
    public RepositoryImpl(String directoryRoot, String directorySubRoot) throws IOException {
        if (directoryRoot != null) {
            this.cacheDirectoryRoot = directoryRoot;
        }
        //initialise directory structure
        if (directorySubRoot != null) {
            this.baseDirectory = cacheDirectoryRoot + directorySubRoot;
        } else {
            this.baseDirectory = cacheDirectoryRoot;
        }
        log.info("Initialising repository.... baseDirectory = " + baseDirectory);
        //create /data/bie
        File repositoryRoot = new File(this.baseDirectory);
        if (!repositoryRoot.exists()) {
            FileUtils.forceMkdir(repositoryRoot);
            log.info("Repository root created at: " + repositoryRoot.getAbsolutePath());
        }
    }

    /**
     * @see org.ala.repository.Repository#getDocumentByGuid(java.lang.String)
     */
    @Override
    public Document getDocumentByGuid(String guid) throws Exception {
        return documentDao.getByUri(guid);
    }

    /**
     * @see org.ala.repository.Repository#getDocumentOutputStream(int, java.lang.String, java.lang.String)
     */
    @Override
    public DocumentOutputStream getDocumentOutputStream(int infoSourceId, String guid, String mimeType)
            throws Exception {

        if (StringUtils.trimToNull(guid) == null) {
            throw new IllegalArgumentException(
                    "Supplied GUID is empty or null. A stored document must have a non-null identifier.");
        }

        Document doc = documentDao.getByUri(guid);
        File file = null;

        if (doc == null) {

            doc = new Document();
            doc.setInfoSourceId(infoSourceId);
            doc.setUri(guid);
            doc.setMimeType(mimeType);

            //store in database
            documentDao.save(doc);

            //update the filepath
            File directory = getDirectoryForNewDoc(infoSourceId, doc.getId());
            file = getOutputFile(directory, FileType.RAW, mimeType);
            doc.setFilePath(directory.getAbsolutePath());
            documentDao.update(doc);
            doc = documentDao.getByUri(guid);

        } else {
            //overwrite...
            documentDao.update(doc);
            File directory = new File(doc.getFilePath());
            file = getOutputFile(directory, FileType.RAW, mimeType);
        }

        //set up the output stream
        DocumentOutputStream dos = new DocumentOutputStream();
        dos.setId(doc.getId());
        dos.setInfoSourceId(infoSourceId);
        dos.setOutputStream(new FileOutputStream(file));

        return dos;
    }

    /**
     * @see org.ala.repository.Repository#getRDFOutputStream(int)
     */
    @Override
    public DocumentOutputStream getRDFOutputStream(int documentId) throws Exception {
        File file = getRDFOutputFile(documentId);
        FileOutputStream fOut = new FileOutputStream(file);

        DocumentOutputStream dos = new DocumentOutputStream();
        dos.setId(documentId);
        //      dos.infoSourceId
        dos.setOutputStream(fOut);
        return dos;
    }

    /**
     * Save some dublin core information for a document.
     * 
     * TODO 
     * 
      * @param dcProperties
      * @throws IOException
     */
    @Override
    public void storeDublinCore(int documentId, Map<String, String> dcProperties) throws IOException {
        // add DC properties to an ordered Map to then serialise into JSON
        Map<String, String> dcMap = new LinkedHashMap<String, String>();
        Iterator<String> keys = dcProperties.keySet().iterator();

        while (keys.hasNext()) {
            String key = keys.next();
            dcMap.put(key, dcProperties.get(key));
        }

        //add source
        Date now = new Date();
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); // ISO format
        dcMap.put(Predicates.DC_MODIFIED.toString(), sdf.format(now)); // "modified"

        //write out as JSON
        ObjectMapper o = new ObjectMapper();
        String dublinCore = o.writeValueAsString(dcMap);
        log.debug("DC json: " + dublinCore);

        //write to file
        storeMetadata(documentId, dublinCore.getBytes(), FileType.DC.toString());
    }

    /**
     * Store this metadata in a file in the saem directory as the document
     * with the supplied document id.
     * 
     * @param documentId
     * @param content
     * @param metadataFileName
     * @throws IOException
     */
    public void storeMetadata(int documentId, byte[] content, String metadataFileName) throws IOException {

        Document doc = documentDao.getById(documentId);
        String directory = doc.getFilePath();

        String fullFilePath = directory + File.separator + metadataFileName;
        File file = new File(fullFilePath);
        if (file.exists()) {
            FileUtils.forceDelete(file);
        }

        FileUtils.writeByteArrayToFile(file, content);
    }

    /**
     * @see org.ala.repository.Repository#storeDocument(int, org.ala.repository.ParsedDocument)
     */
    public Document storeDocument(String uid, ParsedDocument parsedDocument) throws Exception {
        Document doc = null;
        Integer infoSourceId = infoSourceDAO.getInfosourceIdByUid(uid);
        String guid = parsedDocument.getGuid();

        log.debug("***infoSourceId: " + infoSourceId);
        if (infoSourceId != null && guid != null) {
            /*
            if(guid.trim().startsWith("http://www.flickr.com")){
               doc = documentDao.getByUri(guid);
               log.debug("**** existing flickr image docId: " + doc);
               // if no flickr image in bie then add new image otherwise update rdf with occurrenceUid
               if(doc == null){
                  doc = storeDocument(infoSourceId, parsedDocument);
               }
               else{
                  List<Triple<String, String, String>> triples = new ArrayList<Triple<String, String, String>>();               
                  List<Triple<String, String, String>> pTriples = parsedDocument.getTriples();
                  List<Triple<String, String, String>> rdf = null;
                  Map<String,String> dc = null;
                  try{
              rdf = readRdfFile(doc);
              dc = readDcFile(doc);
                  }
                  catch(Exception ex){
              //do nothing
              log.error("*** rdf or dc file: " + ex);
                  }
                  if(rdf != null && dc != null){
              for(Triple<String, String, String> triple : pTriples){
                 if(Predicates.OCCURRENCE_UID.toString().equals(triple.getPredicate())){
                    triples.add(triple);
                    break;
                 }
              }
              rdf = updateTriple(triples, rdf);
              storeRDF(doc.getId(), rdf);
                  }
                  // image directory deleted??, then add new image
                  else{
              doc = storeDocument(infoSourceId, parsedDocument);   
                  }
               }
            }
            // not flickr image
            else{
               doc = storeDocument(infoSourceId, parsedDocument);
            }
            */

            // no image exist check... it always update/create parsedDucument into exist or non exist image
            doc = storeDocument(infoSourceId, parsedDocument);
        } else {
            throw new NullPointerException("No infosourceId match with uid: " + uid);
        }
        return doc;
    }

    /**
     * @see org.ala.repository.Repository#storeDocument(int, org.ala.repository.ParsedDocument)
     */
    public Document storeDocument(int infoSourceId, ParsedDocument parsedDocument) throws Exception {

        //defensively...
        if (parsedDocument == null) {
            return null;
        }

        //      System.out.println("GUID: " + parsedDocument.getGuid());
        Document parentDoc = documentDao.getByUri(parsedDocument.getParentGuid());
        // store the original document
        Document doc = null;

        if (parentDoc != null) {
            doc = storeDocument(infoSourceId, parsedDocument.getGuid(), parsedDocument.getContent(),
                    parsedDocument.getContentType(), parentDoc.getId(), parsedDocument.getScreenShot());
        } else {
            doc = storeDocument(infoSourceId, parsedDocument.getGuid(), parsedDocument.getContent(),
                    parsedDocument.getContentType(), null, parsedDocument.getScreenShot());
        }

        // store triples
        // TODO add any additional triples from the sitemap
        storeRDF(doc.getId(), parsedDocument.getTriples());
        // retrieve the Map of DC properties
        Map<String, String> dcProperties = parsedDocument.getDublinCore();

        if (doc.getInfoSourceName() != null && doc.getInfoSourceUri() != null) {
            // Add dc:publisher & dc:source to properties
            dcProperties.put(Predicates.DC_PUBLISHER.toString(), doc.getInfoSourceName()); // "dc:publisher"
            dcProperties.put(Predicates.DC_SOURCE.toString(), doc.getInfoSourceUri()); // "dc:source"
        }
        // store dublin core properties
        storeDublinCore(doc.getId(), dcProperties);

        return doc;
    }

    /**
     * Store the supplied file.
     * 
     * @param content
     * @throws IOException
     */
    @Override
    public Document storeDocument(int infoSourceId, String guid, byte[] content, String mimeType,
            Integer parentDocumentId) throws IOException {
        return storeDocument(infoSourceId, guid, content, mimeType, parentDocumentId, null);
    }

    /**
     * Store the supplied file.
     * 
     * @param content
     * @throws IOException
     */
    @Override
    public Document storeDocument(int infoSourceId, String guid, byte[] content, String mimeType,
            Integer parentDocumentId, String screenshot) throws IOException {

        if (StringUtils.trimToNull(guid) == null) {
            throw new IllegalArgumentException(
                    "Supplied GUID is empty or null. A stored document must have a non-null identifier.");
        }

        Document doc = documentDao.getByUri(guid);

        if (doc == null) {
            //create
            log.debug("Creating new document for : " + guid);
            doc = new Document();
            doc.setParentDocumentId(parentDocumentId);
            doc.setInfoSourceId(infoSourceId);
            doc.setUri(guid);
            doc.setMimeType(mimeType);
            doc.setParentDocumentId(parentDocumentId);
            //         log.info("PID" + doc.getParentDocumentId());
            //store in database
            documentDao.save(doc);

            //update the filepath
            File directory = getDirectoryForNewDoc(infoSourceId, doc.getId());
            if (screenshot == null) {
                saveContent(directory, content, FileType.RAW, mimeType);
            } else {
                saveContent(directory, content, FileType.SCREENSHOT, mimeType);
            }
            doc.setFilePath(directory.getAbsolutePath());
            documentDao.update(doc);

        } else {
            //overwrite...
            log.debug("Updating document: " + doc);
            //         log.info("PID UP" + doc.getParentDocumentId());
            if (doc.getFilePath() != null) {
                File directory = new File(doc.getFilePath());
                if (screenshot == null) {
                    saveContent(directory, content, FileType.RAW, mimeType);
                } else {
                    saveContent(directory, content, FileType.SCREENSHOT, mimeType);
                }
            } else {
                //if something has gone wrong during harvest, filepath may be null
                File directory = getDirectoryForNewDoc(infoSourceId, doc.getId());
                if (screenshot == null) {
                    saveContent(directory, content, FileType.RAW, mimeType);
                } else {
                    saveContent(directory, content, FileType.SCREENSHOT, mimeType);
                }
                doc.setFilePath(directory.getAbsolutePath());
            }
            doc.setParentDocumentId(parentDocumentId);
            documentDao.update(doc);
        }

        // Refresh document values (infosource name & uri) by DB lookup
        return documentDao.getByUri(guid);
    }

    /**
     * Store the raw byte content for this document.
     * 
     * @param directory
     * @param content
     * @return
     * @throws UnsupportedEncodingException
     * @throws IOException
     */
    private File saveContent(File directory, byte[] content, FileType fileType, String contentType)
            throws UnsupportedEncodingException, IOException {

        File file = getOutputFile(directory, fileType, contentType);
        FileUtils.writeByteArrayToFile(file, content);

        try {
            //generate a thumbnail if the mime type indicate an image
            if (MimeType.getImageMimeTypes().contains(contentType)) {
                GenerateThumbnails.generateThumbnail(file, fileType, contentType, true, false, false);
            }
        } catch (Exception e) {
            log.error("Problem generating a thumbail for " + file.getAbsolutePath() + " " + e.getMessage(), e);
        }
        return file;
    }

    /**
     * Get an output file to write the raw content to for this document.
     * 
     * @param directory
     * @param contentType
     * @return
     * @throws IOException
     */
    private File getOutputFile(File directory, FileType fileType, String contentType) throws IOException {
        if (!directory.exists()) {
            FileUtils.forceMkdir(directory);
        }

        //store the raw file
        File file = new File(
                directory.getAbsolutePath() + File.separator + fileType + MimeType.getFileExtension(contentType));
        if (file.exists()) {
            FileUtils.forceDelete(file);
        }
        file.createNewFile();
        return file;
    }

    /**
     * Retrieve the next available directory for this infosource.
     * 
     * @param infoSourceId
     * @return
     * @throws IOException
     */
    private File getDirectoryForNewDoc(int infoSourceId, int documentId) throws IOException {
        File directory = new File(cacheDirectoryRoot + File.separator + infoSourceId + File.separator
                + documentId / MAX_FILES_PER_DIRECTORY + File.separator + documentId);
        return directory;
    }

    /**
     * Store these triples in rdf/xml or n3 or N-Triple format.
     * 
      * @param triples 
     */
    @Override
    public void storeRDF(int documentId, List<Triple<String, String, String>> triples) throws Exception {

        File file = getRDFOutputFile(documentId);
        FileWriter fw = new FileWriter(file);

        //      if(useTurtle){
        serialiseAsTurtle(fw, triples);
        //      } else {
        //         serialiseAsTab(fw, triples);
        //      }

        fw.flush();
        fw.close();
    }

    /**
     * Get the RDF output file for this document.
     * 
     * @param documentId
     * @return
     * @throws IOException
     */
    private File getRDFOutputFile(int documentId) throws IOException {
        Document doc = documentDao.getById(documentId);
        String filePath = doc.getFilePath();
        File directory = new File(filePath);
        if (!directory.exists()) {
            FileUtils.forceMkdir(directory);
        }

        //store the raw file
        File file = new File(directory.getAbsolutePath() + File.separator + FileType.RDF);
        if (file.exists()) {
            FileUtils.forceDelete(file);
        }

        file.createNewFile();
        return file;
    }

    /**
     * Serialise the triples as subject \t predicate \t object
     * Similar to NTriples.
     * 
     * @param writer
     * @param triples
     * @throws Exception
     */
    private void serialiseAsTab(Writer writer, List<Triple<String, String, String>> triples) throws Exception {
        for (Triple<String, String, String> triple : triples) {
            writer.write(triple.subject);
            writer.write('\t');
            writer.write(triple.predicate);
            writer.write('\t');
            writer.write(triple.object.toString());
            writer.write('\n');
        }
    }

    /**
     * Serialise the triples in turtle format.
     * 
     * See http://www.w3.org/TeamSubmission/turtle/
     * 
     * @param writer
     * @param triples
     * @throws RDFHandlerException
     */
    private void serialiseAsTurtle(Writer writer, List<Triple<String, String, String>> triples)
            throws RDFHandlerException {
        final RDFWriter rdfWriter = new TurtleWriter(writer);
        rdfWriter.startRDF();
        for (Triple<String, String, String> triple : triples) {
            rdfWriter.handleStatement(new StatementImpl(new BNodeImpl(triple.subject.toString()),
                    new URIImpl(triple.predicate.toString()), new LiteralImpl(triple.object.toString())));
        }
        rdfWriter.endRDF();
    }

    /**
     * @param cacheDirectoryRoot the cacheDirectoryRoot to set
     */
    public void setCacheDirectoryRoot(String cacheDirectoryRoot) {
        this.cacheDirectoryRoot = cacheDirectoryRoot;
    }

    /**
     * @param baseDirectory the baseDirectory to set
     */
    public void setBaseDirectory(String baseDirectory) {
        this.baseDirectory = baseDirectory;
    }

    /**
     * @param documentDao the documentDao to set
     */
    public void setDocumentDao(DocumentDAO documentDao) {
        this.documentDao = documentDao;
    }
    //
    //   /**
    //    * @return the useTurtle
    //    */
    //   public boolean isUseTurtle() {
    //      return useTurtle;
    //   }
    //
    //   /**
    //    * @param useTurtle the useTurtle to set
    //    */
    //   public void setUseTurtle(boolean useTurtle) {
    //      this.useTurtle = useTurtle;
    //   }

    /**
     * @return the stagingRoot
     */
    public String getStagingRoot() {
        return stagingRoot;
    }

    /**
     * @param stagingRoot the stagingRoot to set
     */
    public void setStagingRoot(String stagingRoot) {
        this.stagingRoot = stagingRoot;
    }

    /**
     * @return the cacheDirectoryRoot
     */
    public String getCacheDirectoryRoot() {
        return cacheDirectoryRoot;
    }

    public List<Triple<String, String, String>> readRdfFile(Document doc) {
        List<Triple<String, String, String>> triples = null;
        FileReader reader = null;

        if (doc == null) {
            return null;
        }

        //get RDF file content and populate uploadItem.
        try {
            String repoLocation = doc.getFilePath();
            File rdfFile = new File(repoLocation + "/rdf");
            reader = new FileReader(rdfFile);
            List<Triple<String, String, String>> rdf = TurtleUtils.readTurtle(reader, false);
            if (rdf != null) {
                triples = new ArrayList<Triple<String, String, String>>();
                for (Triple<String, String, String> triple : rdf) {
                    triples.add(new Triple<String, String, String>(MappingUtils.getSubject(), triple.predicate,
                            triple.object));
                }
            }
        } catch (Exception e) {
            log.error("readRdfFile(): " + e.toString());
        } finally {
            try {
                if (reader != null)
                    reader.close();
            } catch (IOException e) {
                log.error("readRdfFile(): " + e.toString());
            }
        }
        return triples;
    }

    private List<Triple<String, String, String>> updateTriple(List<Triple<String, String, String>> from,
            List<Triple<String, String, String>> to) {
        List<Triple<String, String, String>> list = null;
        Map<String, Triple<String, String, String>> map = new Hashtable<String, Triple<String, String, String>>();

        for (Triple<String, String, String> triple : to) {
            map.put(triple.getPredicate(), triple);
        }

        for (Triple<String, String, String> triple : from) {
            if (map.containsKey(triple.getPredicate())) {
                map.remove(triple.getPredicate());
            }
            map.put(triple.getPredicate(), triple);
        }

        Collection<Triple<String, String, String>> values = map.values();
        if (values != null) {
            list = new ArrayList<Triple<String, String, String>>(values);
        }
        return list;
    }

    public Map<String, String> readDcFile(Document doc) {
        File dcFile = null;
        List<String[]> dcContents = null;
        Map<String, String> dcProperties = null;
        if (doc == null) {
            return null;
        }

        //get RDF file content and populate uploadItem.
        try {
            String repoLocation = doc.getFilePath();
            dcFile = new File(repoLocation + "/dc");

            //get dc file content and populate uploadItem.
            dcContents = new ArrayList<String[]>();
            dcContents = repoFileUtils.readRepositoryFile(dcFile);
            if (dcContents != null) {
                dcProperties = new LinkedHashMap<String, String>();
                for (String[] values : dcContents) {
                    dcProperties.put(values[0], values[1]);
                }
            }
        } catch (Exception e) {
            log.error("readDcFile(): " + e.toString());
        }
        return dcProperties;
    }
}