org.shareok.data.plosdata.PlosDoiDataImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.shareok.data.plosdata.PlosDoiDataImpl.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package org.shareok.data.plosdata;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.shareok.data.datahandlers.exceptions.NoMatchingRegularExpressionException;
import org.shareok.data.datahandlers.exceptions.NotFoundPublisherException;
import org.shareok.data.htmlrequest.HtmlParser;
import org.shareok.data.documentProcessor.ExcelHandler;
import org.shareok.data.documentProcessor.DocumentProcessorUtil;
import org.shareok.data.dspacemanager.DspaceJournalDataUtil;
import org.shareok.data.plosdata.PlosUtil.JournalType;
import org.springframework.web.multipart.MultipartFile;

/**
 *
 * @author Tao Zhao
 */
public class PlosDoiDataImpl implements ExcelData, PlosDoiData {

    private static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(PlosDoiDataImpl.class);

    private HashMap<String, String[]> data;
    private ExcelHandler excelHandler;
    private ArrayList<String> doiData;
    private ArrayList<PlosData> plosDataList;
    private String outputPath;

    public HashMap<String, String[]> getData() {
        return data;
    }

    public ExcelHandler getExcelHandler() {
        return excelHandler;
    }

    /**
     *
     * @param data
     */
    public void setData(HashMap<String, String[]> data) {
        this.data = data;
    }

    public void setExcelHandler(ExcelHandler excelHandler) {
        this.excelHandler = excelHandler;
    }

    public ArrayList<String> getDoiData() {
        return doiData;
    }

    public ArrayList<PlosData> getPlosDataList() {
        return plosDataList;
    }

    public void setPlosDataList(ArrayList<PlosData> plosDataList) {
        this.plosDataList = plosDataList;
    }

    public String getOutputPath() {
        return outputPath;
    }

    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

    @Override
    public void addData(int row, int col, Object data) {
        String key = Integer.toString(row) + "-" + Integer.toString(col);
        HashMap dataVal = this.getData();
        dataVal.put(key, data);
    }

    public void setDoiData(ArrayList<String> doiData) {
        this.doiData = doiData;
    }

    @Override
    public void printData() {

    }

    @Override
    public void convertHashMapDataToDoi(HashMap mapData) throws Exception {
        Set keys = mapData.keySet();
        Iterator it = keys.iterator();

        ArrayList<String> doiList = new ArrayList<>();
        while (it.hasNext()) {
            String key = (String) it.next();
            String value = (String) mapData.get(key);
            // the values is composed of "val--datatype": for example, Tom--Str or 0.50--num
            String[] values = value.split("--");
            if (null == values || values.length != 2)
                continue;
            String type = values[1];
            value = values[0];
            String[] rowCol = key.split("-");
            if (null == rowCol || rowCol.length != 2)
                throw new Exception("The row and column are not specifid!");
            int row = Integer.parseInt(rowCol[0]);
            int col = Integer.parseInt(rowCol[1]);
            if (col == 3 && value.contains("/journal.p")) {
                int index = value.toLowerCase().indexOf("plos ");
                if (index == -1)
                    continue;
                String doiVal = value.substring(index);
                Pattern pattern = Pattern.compile("(e)(\\d{1,10})(.)(\\s*)(doi:)");
                Matcher matcher = pattern.matcher(doiVal);
                if (matcher.find()) {
                    String[] doiInfo = doiVal.split(":");
                    if (doiInfo.length != 3)
                        continue;
                    String isPartOfSeries = doiInfo[0] + ":" + matcher.group(1) + matcher.group(2);
                    doiVal = isPartOfSeries + "---" + doiInfo[2];
                    doiList.add(doiVal);
                    //System.out.println("Matcher find the string for "+doiInfo[0]+"!!!  \n");//System.out.println(matcher.group(1) + " - " + matcher.group(2));
                } else {
                    System.out.println("Matcher cannot find the string for " + doiVal + "!!!  \n");
                }
            }
        }
        setDoiData(doiList);
    }

    @Override
    public void getDspaceLoadingData(String fileName) throws Exception {
        importData(fileName);
        ArrayList<String> doiList = getDoiData();
        if (!doiList.isEmpty()) {

            PlosRequest req = (PlosRequest) PlosUtil.getPlosContext().getBean("plosRequest");
            PlosData plosData = (PlosData) PlosUtil.getPlosContext().getBean("plosData");

            for (String doi : doiList) {
                String[] valArray = doi.split("---");
                String isPartOfSeries = valArray[0];
                String[] doiArr = valArray[1].split(":");
                doi = doiArr[0];
                plosData = getDspaceJournalLoadingFilesBySingleDoi(doi);
                plosDataList.add(plosData);
            }
        }
    }

    /**
     * 
     * @param fileName : file path
     */
    @Override
    public void importData(String fileName) {

        //String path = DocumentProcessorUtil.getFilePathFromResources(fileName);
        excelHandler.setFileName(fileName);
        try {
            excelHandler.readData();
            HashMap mapData = excelHandler.getData();
            convertHashMapDataToDoi(mapData);
        } catch (Exception ex) {
            Logger.getLogger(PlosDoiDataImpl.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    @Override
    public void exportXmlData(String filePath) {

    }

    @Override
    public String getDspaceLoadingData(MultipartFile file) {
        String filePath = null;
        filePath = DspaceJournalDataUtil.saveUploadedData(file, "plos");
        setOutputPath(DocumentProcessorUtil.getFileContainerPath(filePath) + File.separator + "output");
        try {
            getDspaceLoadingData(filePath);
        } catch (Exception ex) {
            Logger.getLogger(PlosDoiDataImpl.class.getName()).log(Level.SEVERE, null, ex);
        }
        return filePath;
    }

    private String getArticleOutputFolderPath(String doi) {
        return outputPath + File.separator + doi.split("/")[1];
    }

    private void matchCitationDate(String dateString, PlosData plosData) throws Exception {

        Date date;
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
        Pattern pattern = Pattern.compile("(\\d{4}-\\d{2}-\\d{2})|(\\d{4}/\\d{2}/\\d{2})");
        Matcher matcher = pattern.matcher(dateString);
        if (matcher.find()) {
            dateString = matcher.group(0);
            dateString = dateString.replace("-", "/");
            date = sdf.parse(dateString);
        }
        // the date can be in a form of 'MMM dd, yyyy'
        else if (!dateString.equals("")) {
            SimpleDateFormat formatter = new SimpleDateFormat("MMM dd, yyyy"); // 3-letter month name & 2-char day of month
            Date date2 = formatter.parse(dateString);
            String s3 = sdf.format(date2);
            date = sdf.parse(s3);
        } else {
            throw new Exception("Date match not found\n");
        }
        plosData.setDateIssued(date);
    }

    @Override
    public String getDspaceJournalLoadingFilesByDoi(String[] dois, Date time) {
        String uploadPath = null;
        plosDataList.clear();
        uploadPath = DspaceJournalDataUtil.getDspaceJournalUploadPath("plos", time);
        setOutputPath(uploadPath + File.separator + "output_plos");
        for (String doi : dois) {
            getDspaceJournalLoadingFilesBySingleDoi(doi);
        }
        try {
            FileUtils.deleteDirectory(new File(uploadPath + File.separator + "output_plos"));
        } catch (IOException ex) {
            logger.error("Cannot delete the saf folder after being zipped", ex);
        }
        return uploadPath + File.separator + "output_plos.zip";
    }

    private String getIsPartOfSeriesByCitation(String citation) throws NoMatchingRegularExpressionException {
        String isPartOfSeries = null;
        String pattern = "\\.(\\s?)PLoS(\\s?)(.*)(\\d+)(\\((\\d+)\\):(\\s?)e(\\d+))";
        Pattern r = Pattern.compile(pattern);
        Matcher m = r.matcher(citation);
        if (m.find()) {
            isPartOfSeries = m.group(0);
        } else {
            throw new NoMatchingRegularExpressionException("Cannot find match strings at all!");
        }
        return isPartOfSeries.substring(2);
    }

    @Override
    public PlosData getDspaceJournalLoadingFilesBySingleDoi(String doi) {

        PlosRequest req = (PlosRequest) PlosUtil.getPlosContext().getBean("plosRequest");
        PlosData plosData = (PlosData) PlosUtil.getPlosContext().getBean("plosData");

        String doiDataVal = req.getFullData(doi);
        String[] tagNames = { "property" };
        HashMap<String, ArrayList<String>> metaData = HtmlParser.metaDataParserWithTagNames(doiDataVal, tagNames);

        String acknowledgement = PlosUtil.getPlosAck(doiDataVal);
        String citation = PlosUtil.getPlosCitation(doiDataVal);
        String contributions = PlosUtil.getAuthorContributions(doiDataVal);
        String journalTypeString = doi.split("journal.")[1].split("\\.")[0];

        try {
            plosData.setPlosJournalType(journalTypeString);
        } catch (Exception ex) {
            logger.error(ex);
        }
        plosData.setDoi(doi);
        plosData.setRelationUri(req.getRelationUriByDoi(doi));
        plosData.setUri(PlosUtil.DOI_PREFIX + doi);
        plosData.setAcknowledgements(acknowledgement);
        plosData.setAuthorContributions(contributions);
        if (!DocumentProcessorUtil.isEmptyString(citation)) {
            try {
                plosData.setIsPartOfSeries(getIsPartOfSeriesByCitation(citation));
            } catch (NoMatchingRegularExpressionException ex) {
                logger.error("Cannot get the string of IsPartOfSeries!", ex);
            }
        }
        plosData.setCitation(citation);
        JournalType type = plosData.getJournalType();

        switch (type) {
        case PLOSONE:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PONE);
            plosData.setPublisher("PLos One");
            break;
        case PLOSBIO:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PBIO);
            plosData.setPublisher("PLos Biology");
            break;
        case PLOSGEN:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PGEN);
            plosData.setPublisher("PLos Genetics");
            break;
        case PLOSMED:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PMED);
            plosData.setPublisher("PLOS Medicine");
            break;
        case PLOSCBI:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PCBI);
            plosData.setPublisher("PLOS Computational Biology");
            break;
        case PLOSPAT:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PPAT);
            plosData.setPublisher("PLoS Pathogens");
            break;
        case PLOSNTD:
            plosData.setPeerReviewNotes(PlosUtil.PEERREVIEWNOTES_PNTD);
            plosData.setPublisher("PLoS Neglected Tropical Diseases");
            break;
        default: {
            try {
                throw new NotFoundPublisherException("Journal type is undefined!");
            } catch (NotFoundPublisherException ex) {
                logger.error(ex);
            }
        }
            break;
        }

        Iterator it = metaData.entrySet().iterator();

        try {
            while (it.hasNext()) {
                Map.Entry pairs = (Map.Entry) it.next();
                if (pairs.getKey().equals("citation_title") || pairs.getKey().equals("og:title")) {
                    plosData.setTitle(pairs.getValue().toString().replaceAll("(\\[|\\])*", ""));
                } else if (pairs.getKey().equals("twitter:description")
                        || pairs.getKey().equals("og:description")) {
                    plosData.setAbstractText(pairs.getValue().toString().replaceAll("(\\[|\\])*", ""));
                } else if (pairs.getKey().equals("citation_date")) {
                    Date date = null;
                    String dateString = pairs.getValue().toString().replaceAll("(\\[|\\])*", "");
                    matchCitationDate(dateString, plosData);
                } else if (pairs.getKey().equals("citation_author")) {
                    plosData.setAuthors(pairs.getValue().toString().replaceAll("(\\[|\\])*", "").split(", "));
                } else if (pairs.getKey().equals("keywords")) {
                    plosData.setSubjects(pairs.getValue().toString().replaceAll("(\\[|\\])*", "").split(", "));
                }
                it.remove(); // avoids a ConcurrentModificationException
            }
            if (null == plosData.getSubjects()) {
                String[] subjects = PlosUtil.getSubjects(doiDataVal);
                plosData.setSubjects(subjects);
            }
            if (null == plosData.getTitle() || "".equals(plosData.getTitle())) {
                plosData.setTitle(PlosUtil.getTitleFromHtml(doiDataVal));
            }
            // download the PDF full text
            String articleOutputFolderPath = getArticleOutputFolderPath(doi);
            req.downloadPlosOnePdfByDoi(doi, articleOutputFolderPath);
            PlosUtil.createContentFile(articleOutputFolderPath + File.separator + "contents",
                    doi.split("/")[1] + ".pdf");
            plosData.exportXmlByDoiData(articleOutputFolderPath + File.separator + "dublin_core.xml");
            String outputFolderPath = DocumentProcessorUtil.getFileContainerPath(articleOutputFolderPath);
            DspaceJournalDataUtil.packLoadingData(outputFolderPath, "plos");
        } catch (Exception ex) {
            System.out.print("The data processing from doiData to plosData is wrong!\n");
            ex.printStackTrace();
        }
        return plosData;
    }
}