mServer.crawler.sender.MediathekKika.java Source code

Java tutorial

Introduction

Here is the source code for mServer.crawler.sender.MediathekKika.java

Source

/*
 * MediathekView
 * Copyright (C) 2008 W. Xaver
 * W.Xaver[at]googlemail.com
 * http://zdfmediathk.sourceforge.net/
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package mServer.crawler.sender;

import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;

import org.apache.commons.lang3.time.FastDateFormat;

import de.mediathekview.mlib.Config;
import de.mediathekview.mlib.Const;
import de.mediathekview.mlib.daten.DatenFilm;
import de.mediathekview.mlib.tool.Log;
import de.mediathekview.mlib.tool.MSStringBuilder;
import etm.core.configuration.EtmManager;
import etm.core.monitor.EtmPoint;
import mServer.crawler.CrawlerTool;
import mServer.crawler.FilmeSuchen;
import mServer.crawler.GetUrl;
import mServer.tool.MserverDaten;

public class MediathekKika extends MediathekReader {

    public final static String SENDERNAME = Const.KIKA;
    private final HashSetUrl listeAllVideos = new HashSetUrl();
    private MSStringBuilder seite = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);

    public MediathekKika(FilmeSuchen ssearch, int startPrio) {
        super(ssearch, SENDERNAME, 16, /* urlWarten */ 50, startPrio);
        setName("MediathekKiKa");
    }

    @Override
    protected void addToList() {

        meldungStart();
        if (CrawlerTool.loadLongMax()) {
            addToListNormal();
        }
        addToListAllVideo();

        if (Config.getStop()) {
            meldungThreadUndFertig();
        } else if (listeThemen.isEmpty() && listeAllVideos.isEmpty()) {
            meldungThreadUndFertig();
        } else {
            // dann den Sender aus der alten Liste lschen
            // URLs laufen nur begrenzte Zeit
            // delSenderInAlterListe(SENDERNAME); brauchts wohl nicht mehr
            meldungAddMax(listeThemen.size() + listeAllVideos.size());
            for (int t = 0; t <= getMaxThreadLaufen(); ++t) {
                Thread th = new ThemaLaden();
                th.setName(SENDERNAME + t);
                th.start();
            }
        }
    }

    private void addToListNormal() {
        EtmPoint performancePoint = EtmManager.getEtmMonitor().createPoint("MediathekKiKa.addToListNormal");

        final String ADRESSE = "http://www.kika.de/sendungen/sendungenabisz100.html";
        final String MUSTER_URL = "<a href=\"/sendungen/sendungenabisz100_";
        ArrayList<String> liste1 = new ArrayList<>();
        ArrayList<String> liste2 = new ArrayList<>();

        listeThemen.clear();
        try {
            GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
            seite = getUrl.getUri(SENDERNAME, ADRESSE, StandardCharsets.UTF_8, 3, seite, "KiKA: Startseite");
            seite.extractList("", "", MUSTER_URL, "\"", "http://www.kika.de/sendungen/sendungenabisz100_", liste1);

            for (String s : liste1) {
                seite = getUrl.getUri_Utf(getSendername(), s, seite, "KiKa-Sendungen");
                final String MUSTER_SENDUNGEN_1 = "<h4 class=\"headline\">";
                final String MUSTER_SENDUNGEN_2 = "<a href=\"/";
                seite.extractList("", "<!--The bottom navigation -->", MUSTER_SENDUNGEN_1, MUSTER_SENDUNGEN_2, "\"",
                        "http://www.kika.de/", liste2);
            }

            for (String ss : liste2) {
                listeThemen.add(new String[] { ss });
            }
        } catch (Exception ex) {
            Log.errorLog(302025469, ex);
        }
        performancePoint.collect();
    }

    private void addToListAllVideo() {
        EtmPoint performancePoint = EtmManager.getEtmMonitor().createPoint("MediathekKiKa.addToListAllVideo");

        final String ADRESSE = "http://www.kika.de/videos/allevideos/allevideos-buendelgruppen100.html";
        final String MUSTER_URL = "<a href=\"/videos/allevideos/allevideos-buendelgruppen100_page-";
        ArrayList<String> liste1 = new ArrayList<>();
        ArrayList<String> liste2 = new ArrayList<>();

        try {
            GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
            seite = getUrl.getUri(SENDERNAME, ADRESSE, StandardCharsets.UTF_8, 3, seite,
                    "KiKA: Startseite alle Videos");
            seite.extractList("", "", MUSTER_URL, "\"",
                    "http://www.kika.de/videos/allevideos/allevideos-buendelgruppen100_page-", liste1);
            for (String s1 : liste1) {
                seite = getUrl.getUri_Utf(getSendername(), s1, seite, "KiKa-Sendungen");
                seite.extractList("", "", "<div class=\"media mediaA\">\n<a href=\"/", "\"", "http://www.kika.de/",
                        liste2);
            }
            for (String s2 : liste2) {
                listeAllVideos.add(new String[] { s2 });
            }
        } catch (Exception ex) {
            Log.errorLog(732120256, ex);
        }
        performancePoint.collect();
    }

    private class ThemaLaden extends Thread {
        private final ArrayList<String> liste1 = new ArrayList<>();
        private final ArrayList<String> liste2 = new ArrayList<>();
        private MSStringBuilder seite1 = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);
        private MSStringBuilder seite2 = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);
        private MSStringBuilder seite3 = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);

        @Override
        public void run() {
            EtmPoint performancePoint = EtmManager.getEtmMonitor().createPoint("MediathekKiKa::ThemaLaden.run");

            try {
                meldungAddThread();
                String[] link;
                while (!Config.getStop() && (link = listeAllVideos.getListeThemen()) != null) {
                    meldungProgress(link[0]);
                    loadAllVideo_1(link[0] /* url */);
                }
                while (!Config.getStop() && (link = listeThemen.getListeThemen()) != null) {
                    meldungProgress(link[0]);
                    ladenSerien_1(link[0] /* url */);
                }
            } catch (Exception ex) {
                Log.errorLog(915236791, ex);
            }
            meldungThreadUndFertig();
            performancePoint.collect();
        }

        private void ladenSerien_1(String filmWebsite) {
            EtmPoint performancePoint = EtmManager.getEtmMonitor()
                    .createPoint("MediathekKiKa::ThemaLaden.ladenSerien_1");

            try {
                liste1.clear();
                liste2.clear();
                GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
                seite1 = getUrl.getUri(SENDERNAME, filmWebsite, StandardCharsets.UTF_8, 1, seite1, "Themenseite");
                String thema = seite1.extract("<title>", "<");
                thema = thema.replace("KiKA -", "").trim();

                String url = "";
                if (url.isEmpty()) {
                    url = seite1.extract("<h2 class=\"conHeadline\">Alle Folgen</h2>", "<a href=\"", "\"");
                }
                if (url.isEmpty()) {
                    url = seite1.extract("<h2 class=\"conHeadline\">Alle Sendungen</h2>", "<a href=\"", "\"");
                }
                if (url.isEmpty()) {
                    int p = seite1.indexOf("<h2 class=\"conHeadline\">Nchste Folge</h2>");
                    if (p <= 0) {
                        p = 0;
                    }
                    url = seite1.extract("<span class=\"moreBtn\">", "<a href=\"", "\"", p, 0, "");
                }
                if (url.isEmpty()) {
                    Log.errorLog(721356987, "keine URL: " + filmWebsite);
                    return;
                } else {
                    if (!url.startsWith("http://www.kika.de")) {
                        url = "http://www.kika.de" + url;
                    }
                    seite1 = getUrl.getUri(SENDERNAME, url, StandardCharsets.UTF_8, 1, seite1, "Themenseite");
                    seite1.extractList("", "<!--The bottom navigation -->", "<div class=\"shortInfos\">",
                            "<a href=\"", "\"", "http://www.kika.de", liste1);

                    seite1.extractList("", "", "<div class=\"bundleNaviItem \">", "<a href=\"", "\"",
                            "http://www.kika.de", liste2);
                    for (String s : liste2) {
                        seite1 = getUrl.getUri(SENDERNAME, s, StandardCharsets.UTF_8, 1, seite1, "Themenseite");
                        seite1.extractList("", "<!--The bottom navigation -->", "<div class=\"shortInfos\">",
                                "<a href=\"", "\"", "http://www.kika.de", liste1);
                    }
                    if (liste1.isEmpty()) {
                        Log.errorLog(794512630, "keine Filme: " + filmWebsite);
                        return;
                    }
                    int count = 0;
                    int err = 0;
                    for (int i = (liste1.size() - 1); i >= 0; --i) {
                        // die jngsten Beitrge sind am Ende
                        String s = liste1.get(i);
                        ++count;
                        if (!CrawlerTool.loadLongMax() && count > 4) {
                            return;
                        }
                        if (Config.getStop()) {
                            return;
                        }
                        if (!ladenSerien_2(s, thema)) {
                            //dann gibts evtl. nix mehr
                            if (!CrawlerTool.loadLongMax()) {
                                // nur beim kurzen Suchen
                                ++err;
                                if (err > 2) {
                                    //bei ein paar sind Beitrge in der Zukunft angeknndigt
                                    break;
                                }
                            }
                        } else {
                            err = 0;
                        }

                    }
                }
            } catch (Exception ex) {
                Log.errorLog(915263147, ex);
            }
            performancePoint.collect();
        }

        private boolean ladenSerien_2(String filmWebsite, String thema) {
            EtmPoint performancePoint = EtmManager.getEtmMonitor()
                    .createPoint("MediathekKiKa::ThemaLaden.ladenSerien_2");

            boolean ret = false;
            try {
                meldung(filmWebsite);
                GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
                seite1 = getUrl.getUri(SENDERNAME, filmWebsite, StandardCharsets.UTF_8, 1, seite1, "Themenseite");

                String xml = seite1.extract("<div class=\"av-playerContainer\"", "setup({dataURL:'", "'");
                if (!xml.isEmpty()) {
                    ret = true;
                    ladenXml(xml, thema, false /*alle*/);
                }
            } catch (Exception ex) {
                Log.errorLog(801202145, ex);
            }
            performancePoint.collect();
            return ret;
        }

        private void loadAllVideo_1(String url) {
            EtmPoint performancePoint = EtmManager.getEtmMonitor()
                    .createPoint("MediathekKiKa::ThemaLaden.loadAllVideo_1");

            ArrayList<String> liste = new ArrayList<>();
            try {
                GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
                seite2 = getUrl.getUri(getSendername(), url, StandardCharsets.UTF_8, 1, seite2, "KiKa-Sendungen");
                loadAllVideo_2(seite2);
                if (CrawlerTool.loadLongMax()) {
                    seite2.extractList("", "",
                            "<div class=\"bundleNaviItem active\">\n<a href=\"/videos/allevideos/", "\"",
                            "http://www.kika.de/videos/allevideos/", liste);
                    seite2.extractList("", "", "<div class=\"bundleNaviItem \">\n<a href=\"/videos/allevideos/",
                            "\"", "http://www.kika.de/videos/allevideos/", liste);
                }
                for (String u : liste) {
                    if (Config.getStop()) {
                        break;
                    }
                    seite2 = getUrl.getUri(getSendername(), u, StandardCharsets.UTF_8, 1, seite2, "KiKa-Sendungen");
                    loadAllVideo_2(seite2);
                }
            } catch (Exception ex) {
                Log.errorLog(825412369, ex);
            }
            performancePoint.collect();
        }

        private void loadAllVideo_2(MSStringBuilder sStringBuilder) {
            EtmPoint performancePoint = EtmManager.getEtmMonitor()
                    .createPoint("MediathekKiKa::ThemaLaden.loadAllVideo_2");

            ArrayList<String> liste = new ArrayList<>();

            try {
                String thema = sStringBuilder.extract("<h1 class=\"headline\">", "<").trim();
                if (thema.isEmpty()) {
                    thema = sStringBuilder.extract("<title>KiKA -", "<").trim();
                }

                sStringBuilder.extractList(".setup({dataURL:'", "'", liste);
                for (String s : liste) {
                    if (Config.getStop()) {
                        break;
                    }
                    ladenXml(s /* url */, thema, true /*nur neue URLs*/);
                }
            } catch (Exception ex) {
                Log.errorLog(201036987, ex);
            }
            performancePoint.collect();
        }

        private void ladenXml(String xmlWebsite, String thema, boolean urlPruefen) {
            EtmPoint performancePoint = EtmManager.getEtmMonitor()
                    .createPoint("MediathekKiKa::ThemaLaden.ladenXml");

            try {
                GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
                seite3 = getUrl.getUri(getSendername(), xmlWebsite, StandardCharsets.UTF_8, 1, seite3, "");
                if (thema.isEmpty()) {
                    thema = getSendername();
                }
                // manuelle Anpassung, Notlsung!!
                if (thema.equals("ABC-Br")) {
                    thema = "ABC Br";
                }

                String titel = seite3.extract("<title>", "<");
                if (titel.toLowerCase().equals(thema.toLowerCase())) {
                    titel = seite3.extract("<headline>", "<");
                }
                if (titel.toLowerCase().equals(thema.toLowerCase())) {
                    titel = seite3.extract("<topline>", "<");
                    if (titel.isEmpty()) {
                        // dann bleibts dabei
                        titel = seite3.extract("<title>", "<");
                    }
                }
                String beschreibung = seite3.extract("<broadcastDescription>", "<");
                String date = seite3.extract("<broadcastDate>", "<");
                String datum = "";
                String zeit = "";
                if (!date.isEmpty()) {
                    datum = convertDatum(date);
                    zeit = convertTime(date);
                } else {
                    date = seite3.extract("<webTime>", "<"); // <webTime>08.12.2014 13:16</webTime>
                    if (!date.isEmpty()) {
                        datum = date.substring(0, date.indexOf(' ')).trim();
                        zeit = date.substring(date.indexOf(' ')).trim() + ":00";
                    }
                }
                String urlSendung = seite3.extract("<broadcastURL>", "<");
                if (urlSendung.isEmpty()) {
                    urlSendung = seite3.extract("<htmlUrl>", "<");
                }
                long duration = 0;
                long runtime = 0;
                try {
                    //<duration>00:03:07</duration>
                    String dauer = seite3.extract("<duration>", "<");
                    if (!dauer.isEmpty()) {
                        String[] parts = dauer.split(":");
                        long power = 1;
                        for (int i = parts.length - 1; i >= 0; i--) {
                            duration += Long.parseLong(parts[i]) * power;
                            power *= 60;
                        }
                    }
                } catch (NumberFormatException ex) {
                    if (MserverDaten.debug)
                        Log.errorLog(201036547, ex, xmlWebsite);
                }
                // Film-URLs suchen
                final String MUSTER_URL_MP4 = "<progressiveDownloadUrl>";
                String urlHD = seite3.extract("| MP4 Web XL |", MUSTER_URL_MP4, "<");
                String urlMp4 = seite3.extract("| MP4 Web L |", MUSTER_URL_MP4, "<");
                if (urlMp4.isEmpty()) {
                    urlMp4 = seite3.extract("| MP4 Web L+ |", MUSTER_URL_MP4, "<");
                }
                String urlMp4_klein = seite3.extract("| MP4 Web M |", MUSTER_URL_MP4, "<");

                if (urlMp4.isEmpty()) {
                    urlMp4 = urlMp4_klein;
                    urlMp4_klein = "";
                }

                if (thema.isEmpty() || urlSendung.isEmpty() || titel.isEmpty() || urlMp4.isEmpty() || date.isEmpty()
                        || zeit.isEmpty() || duration == 0 /*|| beschreibung.isEmpty()*/) {
                    Log.errorLog(735216987, "leer: " + xmlWebsite);
                }

                if (!urlMp4.isEmpty()) {
                    meldung(urlMp4);
                    DatenFilm film = new DatenFilm(SENDERNAME, thema, urlSendung, titel, urlMp4, ""/*rtmpUrl*/,
                            datum, zeit, duration, beschreibung);
                    CrawlerTool.addUrlKlein(film, urlMp4_klein, "");
                    CrawlerTool.addUrlHd(film, urlHD, "");
                    addFilm(film, urlPruefen);
                } else {
                    Log.errorLog(963215478, " xml: " + xmlWebsite);
                }
            } catch (Exception ex) {
                Log.errorLog(784512365, ex);
            }
            performancePoint.collect();
        }

        private final FastDateFormat sdf = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSZ");

        private String convertDatum(String datum) {
            //<broadcastDate>2014-12-12T09:45:00.000+0100</broadcastDate>
            try {
                FastDateFormat sdfOutDay = FastDateFormat.getInstance("dd.MM.yyyy");

                Date filmDate = sdf.parse(datum);
                datum = sdfOutDay.format(filmDate);
            } catch (ParseException ex) {
                Log.errorLog(731025789, ex, "Datum: " + datum);
            }
            return datum;
        }

        private String convertTime(String zeit) {
            //<broadcastDate>2014-12-12T09:45:00.000+0100</broadcastDate>
            try {
                FastDateFormat sdfOutTime = FastDateFormat.getInstance("HH:mm:ss");

                Date filmDate = sdf.parse(zeit);
                zeit = sdfOutTime.format(filmDate);
            } catch (ParseException ex) {
                Log.errorLog(915423687, ex, "Time: " + zeit);
            }
            return zeit;
        }
    }

}