hu.petabyte.redflags.engine.gear.parser.DocFamilyFetcher.java Source code

Java tutorial

Introduction

Here is the source code for hu.petabyte.redflags.engine.gear.parser.DocFamilyFetcher.java

Source

/*
   Copyright 2014-2016 PetaByte Research Ltd.
    
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    
   http://www.apache.org/licenses/LICENSE-2.0
    
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
 */
package hu.petabyte.redflags.engine.gear.parser;

import static com.google.common.base.Preconditions.checkNotNull;
import hu.petabyte.redflags.engine.gear.AbstractGear;
import hu.petabyte.redflags.engine.model.DisplayLanguage;
import hu.petabyte.redflags.engine.model.Notice;
import hu.petabyte.redflags.engine.model.NoticeID;
import hu.petabyte.redflags.engine.model.Tab;
import hu.petabyte.redflags.engine.tedintf.TedInterfaceHolder;
import hu.petabyte.redflags.engine.tedintf.TedResponse;

import java.text.SimpleDateFormat;
import java.util.Date;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/**
 * @author Zsolt Jurnyi
 */
@Component
public class DocFamilyFetcher extends AbstractGear {

    // TODO AVOID INFINITE LOOPS - AVOID STARTING NEW SUB SESSION WITH THIS GEAR

    protected static final Logger LOG = LoggerFactory.getLogger(DocFamilyFetcher.class);
    protected static final String DATE_FORMAT = "dd-MM-yyyy";

    protected @Value("${redflags.engine.gear.parse.lang:EN}") String rawLang;
    protected DisplayLanguage lang;
    protected @Autowired TedInterfaceHolder ted;

    @Override
    public void beforeSession() throws Exception {
        lang = DisplayLanguage.valueOf(checkNotNull(rawLang));
        LOG.debug("Parsing language is {}", lang);
    }

    private void determineDocFamilyId(Notice notice) {
        NoticeID docFamilyId = notice.getId();
        for (Notice n : notice.getFamilyMembers()) {
            NoticeID memberId = n.getId();
            if (memberId.compareTo(docFamilyId) < 0) {
                docFamilyId = memberId;
            }
        }
        notice.setDocumentFamilyId(docFamilyId);
        for (Notice m : notice.getFamilyMembers()) {
            m.setDocumentFamilyId(docFamilyId);
        }
        LOG.debug("Document family ID for notice {} is {}", notice.getId(), docFamilyId);
    }

    public Notice parseDocFamilyTab(Notice notice, Document docFamilyTab) {
        for (Element memberTable : docFamilyTab.select("table.family")) {
            try {
                NoticeID memberId = new NoticeID(
                        memberTable.select("thead a[href~=TED:NOTICE:]").first().text().split(":", 2)[0]);
                if (!notice.getId().equals(memberId)) {
                    Notice memberNotice = new Notice(memberId);

                    Elements tds = memberTable.select("tbody tr").first().select("td.bgGreen");

                    String rawMemberPubDate = tds.get(0).text();
                    Date memberPubDate = new SimpleDateFormat(DATE_FORMAT).parse(rawMemberPubDate);
                    memberNotice.getData().setPublicationDate(memberPubDate);

                    if (tds.size() > 1) {
                        String rawMemberDeadline = tds.get(1).text();
                        Date memberDeadline = new SimpleDateFormat(DATE_FORMAT).parse(rawMemberDeadline);
                        memberNotice.getData().setDeadline(memberDeadline);
                    }

                    notice.getFamilyMembers().add(memberNotice);
                    LOG.trace("{} Member found: {} - {} (deadline: {})", notice.getId(), memberId, memberPubDate,
                            memberNotice.getData().getDeadline());
                }
            } catch (Exception e) {
                LOG.warn("Cannot parse a document family member of notice " + notice.getId(), e);
            }
        }
        return notice;
    }

    @Override
    protected Notice processImpl(Notice notice) throws Exception {
        TedResponse r = ted.get().requestNoticeTabQuietly(notice.getId(), lang, Tab.DATA);
        if (null != r) {
            Document dataTab = r.getParsedDocument();
            if (!dataTab.select("a[href~=tabId=4").isEmpty()) {
                TedResponse r2 = ted.get().requestNoticeTabQuietly(notice.getId(), lang, Tab.DOCUMENT_FAMILY);
                if (null != r2) {
                    Document docFamilyTab = r2.getParsedDocument();
                    notice = parseDocFamilyTab(notice, docFamilyTab);
                }
            }
        }

        determineDocFamilyId(notice);

        return notice;
    }

}