com.romeikat.datamessie.core.base.service.DocumentService.java Source code

Java tutorial

Introduction

Here is the source code for com.romeikat.datamessie.core.base.service.DocumentService.java

Source

package com.romeikat.datamessie.core.base.service;

/*-
 * ============================LICENSE_START============================
 * data.messie (core)
 * =====================================================================
 * Copyright (C) 2013 - 2017 Dr. Raphael Romeikat
 * =====================================================================
 * This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public
License along with this program.  If not, see
<http://www.gnu.org/licenses/gpl-3.0.html>.
 * =============================LICENSE_END=============================
 */
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.Collection;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.mutable.MutableObject;
import org.hibernate.StatelessSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.stereotype.Service;
import com.romeikat.datamessie.core.base.app.shared.IStatisticsManager;
import com.romeikat.datamessie.core.base.app.shared.SharedBeanProvider;
import com.romeikat.datamessie.core.base.dao.impl.CleanedContentDao;
import com.romeikat.datamessie.core.base.dao.impl.CrawlingDao;
import com.romeikat.datamessie.core.base.dao.impl.DocumentDao;
import com.romeikat.datamessie.core.base.dao.impl.RawContentDao;
import com.romeikat.datamessie.core.base.dao.impl.SourceDao;
import com.romeikat.datamessie.core.base.dao.impl.StemmedContentDao;
import com.romeikat.datamessie.core.base.task.management.TaskCancelledException;
import com.romeikat.datamessie.core.base.task.management.TaskExecution;
import com.romeikat.datamessie.core.base.task.management.TaskExecutionWork;
import com.romeikat.datamessie.core.base.util.converter.LocalDateConverter;
import com.romeikat.datamessie.core.base.util.sparsetable.StatisticsRebuildingSparseTable;
import com.romeikat.datamessie.core.domain.entity.impl.CleanedContent;
import com.romeikat.datamessie.core.domain.entity.impl.Document;
import com.romeikat.datamessie.core.domain.entity.impl.RawContent;
import com.romeikat.datamessie.core.domain.entity.impl.StemmedContent;
import com.romeikat.datamessie.core.domain.enums.DocumentProcessingState;

/**
 * Service for maintaining documents and their raw, cleaned, and stemmed contents, and for resetting
 * ("deprocessing") the documents of a source to an earlier processing state.
 *
 * <p>
 * All persistence goes through the injected DAOs using the {@link StatelessSession} supplied by the
 * caller; this class does not open or close sessions itself.
 */
@Service
public class DocumentService {

    private static final Logger LOG = LoggerFactory.getLogger(DocumentService.class);

    @Autowired
    private SharedBeanProvider sharedBeanProvider;

    @Autowired
    @Qualifier("documentDao")
    private DocumentDao documentDao;

    @Autowired
    private RawContentDao rawContentDao;

    @Autowired
    private CleanedContentDao cleanedContentDao;

    @Autowired
    private StemmedContentDao stemmedContentDao;

    @Autowired
    @Qualifier("crawlingDao")
    private CrawlingDao crawlingDao;

    @Autowired
    @Qualifier("sourceDao")
    private SourceDao sourceDao;

    /**
     * Brings the stored raw content of a document in line with the given content.
     *
     * <p>
     * Non-blank content is inserted if no raw content exists yet for the document, and otherwise
     * written to the existing entity. Blank content (as defined by
     * {@link StringUtils#isNotBlank(CharSequence)}) causes an existing entity to be deleted; if none
     * exists, nothing happens.
     *
     * @param statelessSession session used for all DAO calls
     * @param documentId id of the document the content belongs to
     * @param content new raw content; may be {@code null} or blank
     */
    public void createUpdateOrDeleteRawContent(final StatelessSession statelessSession, final long documentId,
            final String content) {
        RawContent rawContent = rawContentDao.getEntity(statelessSession, documentId);

        // Create or update
        if (StringUtils.isNotBlank(content)) {
            if (rawContent == null) {
                rawContent = new RawContent(documentId, content);
                rawContentDao.insert(statelessSession, rawContent);
            } else {
                rawContent.setContent(content);
                rawContentDao.update(statelessSession, rawContent);
            }
        }

        // Delete
        else {
            if (rawContent != null) {
                rawContentDao.delete(statelessSession, rawContent);
            }
        }
    }

    /**
     * Brings the stored cleaned content of a document in line with the given content.
     *
     * <p>
     * Non-blank content is inserted if no cleaned content exists yet for the document, and otherwise
     * written to the existing entity. Blank content causes an existing entity to be deleted; if none
     * exists, nothing happens.
     *
     * @param statelessSession session used for all DAO calls
     * @param documentId id of the document the content belongs to
     * @param content new cleaned content; may be {@code null} or blank
     */
    public void createUpdateOrDeleteCleanedContent(final StatelessSession statelessSession, final long documentId,
            final String content) {
        CleanedContent cleanedContent = cleanedContentDao.getEntity(statelessSession, documentId);

        // Create or update
        if (StringUtils.isNotBlank(content)) {
            if (cleanedContent == null) {
                cleanedContent = new CleanedContent(documentId, content);
                cleanedContentDao.insert(statelessSession, cleanedContent);
            } else {
                cleanedContent.setContent(content);
                cleanedContentDao.update(statelessSession, cleanedContent);
            }
        }

        // Delete
        else {
            if (cleanedContent != null) {
                cleanedContentDao.delete(statelessSession, cleanedContent);
            }
        }
    }

    /**
     * Brings the stored stemmed content of a document in line with the given content.
     *
     * <p>
     * Non-blank content is inserted if no stemmed content exists yet for the document, and otherwise
     * written to the existing entity. Blank content causes an existing entity to be deleted; if none
     * exists, nothing happens.
     *
     * @param statelessSession session used for all DAO calls
     * @param documentId id of the document the content belongs to
     * @param content new stemmed content; may be {@code null} or blank
     */
    public void createUpdateOrDeleteStemmedContent(final StatelessSession statelessSession, final long documentId,
            final String content) {
        StemmedContent stemmedContent = stemmedContentDao.getEntity(statelessSession, documentId);

        // Create or update
        if (StringUtils.isNotBlank(content)) {
            if (stemmedContent == null) {
                stemmedContent = new StemmedContent(documentId, content);
                stemmedContentDao.insert(statelessSession, stemmedContent);
            } else {
                stemmedContent.setContent(content);
                stemmedContentDao.update(statelessSession, stemmedContent);
            }
        }

        // Delete
        else {
            if (stemmedContent != null) {
                stemmedContentDao.delete(statelessSession, stemmedContent);
            }
        }
    }

    /**
     * Sets the state of each given document to {@link DocumentProcessingState#TO_BE_DELETED} and
     * persists it. {@code null} elements are skipped.
     *
     * @param statelessSession session used for the updates
     * @param documents documents to mark; must not be {@code null}
     */
    public void markDocumentsToBeDeleted(final StatelessSession statelessSession,
            final Collection<Document> documents) {
        for (final Document document : documents) {
            markDocumentToBeDeleted(statelessSession, document);
        }
    }

    /**
     * Marks a single document as {@link DocumentProcessingState#TO_BE_DELETED} and updates it via the
     * document DAO. Does nothing for a {@code null} document.
     */
    private void markDocumentToBeDeleted(final StatelessSession statelessSession, final Document document) {
        if (document == null) {
            return;
        }

        LOG.info("Document {} to be deleted", document.getId());
        document.setState(DocumentProcessingState.TO_BE_DELETED);
        documentDao.update(statelessSession, document);
    }

    /**
     * Resets the documents of a source to the given target state, one download date at a time, and
     * afterwards asks the statistics manager (if one is available) to rebuild the affected statistics.
     *
     * <p>
     * Iteration starts at the date of the minimum downloaded timestamp reported by the document DAO and
     * advances day by day until {@link #getNextDownloadedDate(LocalDate)} returns {@code null}. If the
     * DAO reports no minimum, no documents are touched.
     *
     * @param statelessSession session used for all DAO calls
     * @param taskExecution task execution that receives the progress reports
     * @param sourceId id of the source whose documents are deprocessed
     * @param targetState state the documents are reset to; must not be {@code null}
     * @throws TaskCancelledException propagated from the per-date processing; in that case the final
     *         statistics rebuild and work-end report are not reached
     */
    public void deprocessDocumentsOfSource(final StatelessSession statelessSession,
            final TaskExecution taskExecution, final long sourceId, final DocumentProcessingState targetState)
            throws TaskCancelledException {
        // Initialize
        final TaskExecutionWork work = taskExecution.reportWorkStart(
                String.format("Deprocessing documents of source %s to state %s", sourceId, targetState.getName()));
        final StatisticsRebuildingSparseTable statisticsToBeRebuilt = new StatisticsRebuildingSparseTable();

        // Process all download dates one after another, starting with the minimum downloaded date;
        // a null date (no minimum, or end of range reached) terminates the loop
        for (LocalDate downloadedDate = getMinDownloadedDate(
                statelessSession); downloadedDate != null; downloadedDate = getNextDownloadedDate(downloadedDate)) {
            deprocessDocumentsOfSourceAndDownloadDate(statelessSession, taskExecution, sourceId, targetState,
                    statisticsToBeRebuilt, downloadedDate);
        }

        // Rebuild statistics
        final IStatisticsManager statisticsManager = sharedBeanProvider.getSharedBean(IStatisticsManager.class);
        if (statisticsManager != null) {
            statisticsManager.rebuildStatistics(statisticsToBeRebuilt);
        }

        // Done
        taskExecution.reportWorkEnd(work);
    }

    /**
     * Returns the date part of the minimum downloaded timestamp reported by the document DAO, or
     * {@code null} if the DAO reports none.
     */
    private LocalDate getMinDownloadedDate(final StatelessSession statelessSession) {
        final LocalDateTime minDownloadedDateTime = documentDao.getMinDownloaded(statelessSession);
        return minDownloadedDateTime == null ? null : minDownloadedDateTime.toLocalDate();
    }

    /**
     * Loads the documents of the given source and download date, deprocesses them, and reports the
     * step to the task execution.
     *
     * @throws TaskCancelledException if the task execution signals cancellation (presumably from
     *         {@code checkpoint()} - confirm against {@code TaskExecution})
     */
    private void deprocessDocumentsOfSourceAndDownloadDate(final StatelessSession statelessSession,
            final TaskExecution taskExecution, final long sourceId, final DocumentProcessingState targetState,
            final StatisticsRebuildingSparseTable statisticsToBeRebuilt, final LocalDate downloadedDate)
            throws TaskCancelledException {
        final TaskExecutionWork work = taskExecution
                .reportWorkStart(String.format("Deprocessing documents with download date %s",
                        LocalDateConverter.INSTANCE_UI.convertToString(downloadedDate)));

        // Load
        final List<Document> documents = documentDao.getForSourceAndDownloaded(statelessSession, sourceId,
                downloadedDate);

        // Deprocess
        deprocessDocuments(statelessSession, documents, targetState, statisticsToBeRebuilt);

        taskExecution.reportWorkEnd(work);
        taskExecution.checkpoint();
    }

    /**
     * Resets every document for which the transition from its current state to {@code targetState} is
     * valid, persists it, and records its source and published date for the statistics rebuild.
     * {@code null} documents and documents with an invalid transition are skipped.
     */
    private void deprocessDocuments(final StatelessSession statelessSession, final Collection<Document> documents,
            final DocumentProcessingState targetState,
            final StatisticsRebuildingSparseTable statisticsToBeRebuilt) {
        for (final Document document : documents) {
            // Guard before the first dereference; logging document.getId() ahead of this check
            // would throw a NullPointerException for null elements
            if (document == null) {
                continue;
            }
            LOG.debug("Deprocessing document {}", document.getId());

            // Update state
            final DocumentProcessingState oldState = document.getState();
            if (!isValidDeprocessingStep(oldState, targetState)) {
                continue;
            }
            document.setState(targetState);
            documentDao.update(statelessSession, document);

            // Rebuild statistics
            final long sourceId = document.getSourceId();
            final LocalDate publishedDate = document.getPublishedDate();
            statisticsToBeRebuilt.putValue(sourceId, publishedDate, true);
        }
    }

    /**
     * Decides whether a document in {@code sourceState} may be reset to {@code targetState}.
     *
     * <p>
     * Error states are never valid targets. For the source states listed in the switch, only the
     * explicitly named earlier states are accepted; any source state not listed is accepted.
     *
     * @param sourceState current state of the document; a {@code null} value makes the switch throw a
     *        {@link NullPointerException}
     * @param targetState state the document should be reset to
     * @return {@code true} if the transition is allowed
     */
    private boolean isValidDeprocessingStep(final DocumentProcessingState sourceState,
            final DocumentProcessingState targetState) {
        // Check target state
        if (DocumentProcessingState.getErrorStates().contains(targetState)) {
            return false;
        }
        // Check source -> target states
        switch (sourceState) {
        case DOWNLOADED:
        case DOWNLOAD_ERROR:
            return false;
        case REDIRECTED:
        case REDIRECTING_ERROR:
            return targetState == DocumentProcessingState.DOWNLOADED;
        case CLEANED:
            return targetState == DocumentProcessingState.DOWNLOADED
                    || targetState == DocumentProcessingState.REDIRECTED;
        // NOTE(review): CLEANING_ERROR shares the STEMMED rule and may therefore be reset to CLEANED,
        // whereas the other error states share the rule of their non-error counterpart - confirm
        // whether CLEANING_ERROR was meant to be grouped with CLEANED instead
        case CLEANING_ERROR:
        case STEMMED:
            return targetState == DocumentProcessingState.DOWNLOADED
                    || targetState == DocumentProcessingState.REDIRECTED
                    || targetState == DocumentProcessingState.CLEANED;
        case TECHNICAL_ERROR:
            return targetState == DocumentProcessingState.DOWNLOADED
                    || targetState == DocumentProcessingState.REDIRECTED
                    || targetState == DocumentProcessingState.CLEANED
                    || targetState == DocumentProcessingState.STEMMED;
        case TO_BE_DELETED:
            return false;
        }
        // Done
        return true;
    }

    /**
     * Returns the day following {@code downloadedDate}, or {@code null} once {@code downloadedDate}
     * lies after the current date.
     *
     * <p>
     * A date equal to today still advances, so the day after today is the last date this method
     * returns.
     */
    private LocalDate getNextDownloadedDate(final LocalDate downloadedDate) {
        // Stop once the given date is past today
        final LocalDate now = LocalDate.now();
        if (downloadedDate.isAfter(now)) {
            return null;
        }
        // Otherwise, go to next date
        return downloadedDate.plusDays(1);
    }

}