mServer.crawler.sender.MediathekNdr.java Source code

Java tutorial

Introduction

Here is the source code for mServer.crawler.sender.MediathekNdr.java

Source

/*    
 *    MediathekView
 *    Copyright (C) 2008   W. Xaver
 *    W.Xaver[at]googlemail.com
 *    http://zdfmediathk.sourceforge.net/
 *    
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package mServer.crawler.sender;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;

import org.apache.commons.lang3.time.FastDateFormat;

import de.mediathekview.mlib.Config;
import de.mediathekview.mlib.Const;
import de.mediathekview.mlib.daten.DatenFilm;
import de.mediathekview.mlib.tool.Log;
import de.mediathekview.mlib.tool.MSStringBuilder;
import mServer.crawler.CrawlerTool;
import mServer.crawler.FilmeSuchen;
import mServer.crawler.GetUrl;
import mServer.tool.MserverDaten;

public class MediathekNdr extends MediathekReader implements Runnable {

    /** Sender name used for the super constructor, page requests, created films and thread names. */
    public final static String SENDERNAME = Const.NDR;
    /** Page buffer that is handed to and reassigned from the page load in alleSeiteSuchen(). */
    private MSStringBuilder seiteAlle = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);

    /**
     * Creates the NDR crawler.
     *
     * @param ssearch   forwarded unchanged to the super constructor
     * @param startPrio forwarded unchanged to the super constructor
     */
    public MediathekNdr(FilmeSuchen ssearch, int startPrio) {
        super(ssearch, SENDERNAME, /* threads */ 2, /* urlWarten */ 50, startPrio);
    }

    /**
     * Fills {@code listeThemen} with all overview pages to crawl and starts the worker threads.
     *
     * <p>Queued are (1) one overview page per broadcast linked from the "Sendungen A-Z" index, plus
     * the follow-up pages found by {@code alleSeiteSuchen()} when {@code CrawlerTool.loadLongMax()}
     * is set, and (2) one "Sendung verpasst" (missed broadcast) page per day for the last 20 days
     * (30 days when {@code CrawlerTool.loadLongMax()} is set).
     *
     * <p>If the crawler was stopped or nothing was queued, completion is reported right away;
     * otherwise {@code getMaxThreadLaufen()} worker threads are started.
     */
    @Override
    protected void addToList() {
        final String ADRESSE = "http://www.ndr.de/mediathek/sendungen_a-z/index.html";
        // an index entry starts with:
        // <li><a href="/mediathek/mediatheksuche105_broadcast-30.html">
        final String MUSTER_URL1 = "<li><a href=\"/mediathek/mediatheksuche105_broadcast-";
        final String URL_THEMA = "http://www.ndr.de/mediathek/mediatheksuche105_broadcast-";
        // http://www.ndr.de/mediathek/sendung_verpasst/epg1490_date-2015-09-05_display-all.html
        final String URL_VERPASST = "http://www.ndr.de/mediathek/sendung_verpasst/epg1490_date-";
        // long arithmetic on purpose: 25 days in milliseconds no longer fit into an int
        final long MILLIS_PER_DAY = 24L * 60 * 60 * 1000;

        listeThemen.clear();

        meldungStart();

        MSStringBuilder seite = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);
        final GetUrl getUrlIo = new GetUrl(getWartenSeiteLaden());
        seite = getUrlIo.getUri(SENDERNAME, ADRESSE, StandardCharsets.UTF_8, 5 /* versuche */, seite,
                ""/* meldung */);
        int pos = 0;
        while ((pos = seite.indexOf(MUSTER_URL1, pos)) != -1) {
            try {
                pos += MUSTER_URL1.length();
                // Start every entry with empty values so that a malformed entry can never inherit
                // the URL or the topic of the entry before it.
                String url = "";
                String thema = "";
                final int urlEnd = seite.indexOf("\"", pos);
                if (urlEnd != -1) {
                    url = seite.substring(pos, urlEnd);
                }
                final int themaStart = seite.indexOf(">", pos);
                final int themaEnd = seite.indexOf("<", pos);
                if (themaStart != -1 && themaEnd != -1 && themaStart < themaEnd) {
                    thema = seite.substring(themaStart + 1, themaEnd);
                }
                if (url.isEmpty()) {
                    Log.errorLog(210367600, "keine Url");
                    continue;
                }
                final String urlThema = URL_THEMA + url;
                // The first overview page is always queued: alleSeiteSuchen() only adds the pages
                // 2..n, so skipping this entry would lose the first page of every paginated
                // broadcast.
                listeThemen.addUrl(new String[] { urlThema, thema });
                if (CrawlerTool.loadLongMax()) {
                    alleSeiteSuchen(urlThema, thema);
                }
            } catch (Exception ex) {
                Log.errorLog(332945670, ex);
            }
        }

        // additionally queue the "Sendung verpasst" pages of the last days, e.g.
        // http://www.ndr.de/mediathek/sendung_verpasst/epg1490_date-2014-05-17.html
        // ("_display-onlyvideo.html" is not used because it is not always correct)
        final FastDateFormat urlDateFormat = FastDateFormat.getInstance("yyyy-MM-dd");
        final FastDateFormat filmDateFormat = FastDateFormat.getInstance("dd.MM.yyyy");
        final int maxTage = CrawlerTool.loadLongMax() ? 30 : 20;
        // one reference instant for all days, so both formats of a day always agree
        final long now = System.currentTimeMillis();
        for (int i = 0; i < maxTage; ++i) {
            final long day = now - i * MILLIS_PER_DAY;
            final String urlString = URL_VERPASST + urlDateFormat.format(day) + "_display-all.html";
            // the formatted date travels in the "topic" slot and becomes the film date later on
            listeThemen.addUrl(new String[] { urlString, filmDateFormat.format(day) });
        }

        if (Config.getStop() || listeThemen.isEmpty()) {
            meldungThreadUndFertig();
        } else {
            meldungAddMax(listeThemen.size());
            for (int t = 0; t < getMaxThreadLaufen(); ++t) {
                Thread th = new ThemaLaden();
                th.setName(SENDERNAME + t);
                th.start();
            }
        }
    }

    /**
     * Loads the first overview page of a broadcast and queues its follow-up pages.
     *
     * <p>The highest page number is read from the pager links ({@code title="Zeige Seite N"}).
     * Pages 2 up to that number, but at most page 10, are added to {@code listeThemen}; the first
     * page itself is not added by this method.
     *
     * @param strUrlFeed URL of the first overview page; ".html" is replaced by "_page-N.html" to
     *                   build the follow-up URLs
     * @param tthema     topic stored together with every queued page
     * @return {@code true} if at least one follow-up page was queued
     */
    private boolean alleSeiteSuchen(String strUrlFeed, String tthema) {
        // <a class="square button" href="/mediathek/mediatheksuche105_broadcast-1391_page-5.html" title="Zeige Seite 5">
        final String WEITER = " title=\"Zeige Seite ";
        final int MAX_SEITEN = 10;

        boolean ret = false;
        final GetUrl getUrlIo = new GetUrl(getWartenSeiteLaden());
        seiteAlle = getUrlIo.getUri(SENDERNAME, strUrlFeed, StandardCharsets.UTF_8, 3 /* versuche */, seiteAlle,
                "Thema: " + tthema/* meldung */);
        try {
            int maxSeite = 0;
            int pos = 0;
            while ((pos = seiteAlle.indexOf(WEITER, pos)) != -1) {
                pos += WEITER.length();
                final int end = seiteAlle.indexOf("\"", pos);
                if (end == -1) {
                    continue;
                }
                try {
                    maxSeite = Math.max(maxSeite, Integer.parseInt(seiteAlle.substring(pos, end)));
                } catch (NumberFormatException ex) {
                    // keep the cause in the log instead of only the URL
                    Log.errorLog(643208979, ex, strUrlFeed);
                }
            }
            // starts at 2 because strUrlFeed itself is page 1:
            // from: http://www.ndr.de/mediathek/mediatheksuche105_broadcast-30.html
            // to:   http://www.ndr.de/mediathek/mediatheksuche105_broadcast-30_page-3.html
            for (int i = 2; i <= maxSeite && i <= MAX_SEITEN; ++i) {
                final String urlSeite = strUrlFeed.replace(".html", "_page-" + i + ".html");
                listeThemen.addUrl(new String[] { urlSeite, tthema });
                ret = true;
            }
        } catch (Exception ex) {
            Log.errorLog(913047821, ex, strUrlFeed);
        }
        return ret;
    }

    private class ThemaLaden extends Thread {

        // loader used by filmSuchen_1() and filmSuchen_2() for the film page and the JSON file
        private final GetUrl getUrl = new GetUrl(getWartenSeiteLaden());
        // buffer for the overview page loaded in feedEinerSeiteSuchen()
        private MSStringBuilder seite1 = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);
        // buffer for the film page loaded in filmSuchen_1()
        private MSStringBuilder seite2 = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);
        // buffer for the player JSON loaded in filmSuchen_2()
        private MSStringBuilder seite3 = new MSStringBuilder(Const.STRING_BUFFER_START_BUFFER);
        // scratch list for the video URLs extracted in filmSuchen_2(); cleared there after use
        private final ArrayList<String> liste = new ArrayList<>();

        /**
         * Worker loop: takes overview pages from {@code listeThemen} until the list is exhausted
         * or the crawler is stopped and hands each one to {@code feedEinerSeiteSuchen()}.
         *
         * <p>A failure on one page is logged and does not end the loop. The end of the thread is
         * reported via {@code meldungThreadUndFertig()} in a {@code finally} block, so it is
         * reported even if something other than an {@code Exception} ends the loop.
         */
        @Override
        public void run() {
            try {
                meldungAddThread();
                String[] link;
                while (!Config.getStop() && (link = listeThemen.getListeThemen()) != null) {
                    try {
                        // link[0] = URL of the overview page, link[1] = topic (or date)
                        meldungProgress(link[1]);
                        feedEinerSeiteSuchen(link[0], link[1] /* thema */);
                    } catch (Exception ex) {
                        Log.errorLog(336901211, ex);
                    }
                }
            } catch (Exception ex) {
                Log.errorLog(554632590, ex);
            } finally {
                meldungThreadUndFertig();
            }
        }

        /**
         * Converts a duration such as {@code "29:59"} or {@code "1:02:03"} into seconds.
         *
         * <p>The segments are separated by ":" and each segment counts 60 times the one to its
         * right. If any segment is not a number the whole value is treated as unknown and 0 is
         * returned, instead of the sum of only those segments that happened to parse.
         *
         * @param duration   duration text; an empty string yields 0
         * @param strUrlFeed URL of the page being parsed, only used for logging
         * @return the duration in seconds, or 0 if it is empty or cannot be parsed
         */
        private long convertDuration(final String duration, final String strUrlFeed) {
            try {
                if (duration.isEmpty()) {
                    return 0;
                }
                long seconds = 0;
                for (final String part : duration.split(":")) {
                    seconds = seconds * 60 + Long.parseLong(part.trim());
                }
                return seconds;
            } catch (NumberFormatException ex) {
                // malformed durations are expected on some pages: only report them in debug mode
                if (MserverDaten.debug) {
                    Log.errorLog(369015497, ex, strUrlFeed);
                }
            } catch (Exception ex) {
                Log.errorLog(369015497, ex, strUrlFeed);
            }
            return 0;
        }

        /**
         * Loads one overview page and passes every film found on it to {@code filmSuchen_1()}.
         *
         * <p>Two page layouts are handled. A page containing {@code <strong class="time">} is
         * treated as a "Sendung verpasst" day page: topic and title come from the film link,
         * the date is {@code tthema} and the time is the text following the marker. Every other
         * page is treated as a broadcast page: the topic is {@code tthema} and title, date and
         * duration are read from the entry.
         *
         * @param strUrlFeed URL of the overview page
         * @param tthema     topic of the page; on day pages this is the date (dd.MM.yyyy)
         */
        private void feedEinerSeiteSuchen(String strUrlFeed, String tthema) {
            final String MUSTER_URL = "<a href=\"";
            final String MUSTER_TAGE = "<strong class=\"time\">";
            final String MUSTER_SENDUNG = "<span class=\"icon icon_video\"></span>";
            final String TRENNER = " - ";

            GetUrl getUrlIo = new GetUrl(getWartenSeiteLaden());
            seite1 = getUrlIo.getUri(SENDERNAME, strUrlFeed, StandardCharsets.UTF_8, 3 /* versuche */, seite1,
                    "Thema: " + tthema/* meldung */);
            try {
                meldung(strUrlFeed);
                final boolean tage = seite1.indexOf(MUSTER_TAGE) != -1;
                final String muster = tage ? MUSTER_TAGE : MUSTER_SENDUNG;

                int pos = 0;
                while (!Config.getStop() && (pos = seite1.indexOf(muster, pos)) != -1) {
                    pos += muster.length();
                    String url = seite1.extract(MUSTER_URL, "\"", pos);
                    if (url.isEmpty()) {
                        Log.errorLog(659210274, "keine Url feedEinerSeiteSuchen" + strUrlFeed);
                        continue;
                    }
                    if (!url.startsWith("http")) {
                        url = "http://www.ndr.de" + url;
                    }

                    // All per-film values are local to the iteration so that nothing leaks from
                    // one film into the next when a value cannot be read.
                    String thema = tthema;
                    String titel;
                    String datum;
                    String zeit = "";
                    final long durationInSeconds;

                    if (tage) {
                        // <h3><a href="/fernsehen/epg/import/Rote-Rosen,sendung64120.html" title="Rote Rosen"  >Rote Rosen (1725)</a></h3>
                        thema = seite1.extract(MUSTER_URL, " title=\"", "\"", pos, 0, "");
                        titel = seite1.extract(MUSTER_URL, ">", "<", pos, 0, "");
                        if (titel.contains("(Wdh.)")) {
                            // a rerun: the film should already be in the list
                            continue;
                        }
                        if (thema.equals(titel) && thema.contains(TRENNER)) {
                            // "Topic - Title": split at the first separator only, so that a title
                            // which itself contains " - " is kept intact
                            final int sep = thema.indexOf(TRENNER);
                            titel = thema.substring(sep + TRENNER.length()).trim();
                            thema = thema.substring(0, sep).trim();
                        }

                        datum = tthema;
                        // the time is the text between the marker and the next tag
                        final int timeEnd = seite1.indexOf("<", pos);
                        final String time = timeEnd == -1 ? "" : seite1.substring(pos, timeEnd).trim();
                        try {
                            Date filmDate = FastDateFormat.getInstance("HH:mm").parse(time);
                            zeit = FastDateFormat.getInstance("HH:mm:ss").format(filmDate);
                        } catch (Exception ex) {
                            Log.errorLog(795623017, "convertDatum: " + strUrlFeed);
                        }

                        //<span class="icon icon_video" aria-label="L&auml;nge"></span>29:59</div>
                        final String duration = seite1.extract("\"L&auml;nge\"></span>", "<", pos).trim();
                        durationInSeconds = convertDuration(duration, strUrlFeed);
                    } else {
                        titel = seite1.extract(" title=\"", "\"", pos);
                        titel = titel.replace("Zum Video:", "").trim();

                        final String dateTime = seite1.extract("<div class=\"subline date\">", "<", pos);
                        final String[] dateValues = parseDateTime(dateTime, strUrlFeed);
                        datum = dateValues[0];
                        zeit = dateValues[1];

                        String duration = seite1.extract("Video (", ")", pos);
                        duration = duration.replace("min", "").trim();
                        durationInSeconds = convertDuration(duration, strUrlFeed);
                    }
                    filmSuchen_1(strUrlFeed, thema, titel, url, datum, zeit, durationInSeconds, tage);
                }
            } catch (Exception ex) {
                // keep the cause in the log instead of only the URL
                Log.errorLog(693219870, ex, strUrlFeed);
            }
        }

        /**
         * Splits a text such as {@code "17.05.2014 20:15 Uhr"} into a date and a time.
         *
         * @param dateTimeValue text to parse; "Uhr" and surrounding whitespace are removed first
         * @param strUrlFeed    URL of the page being parsed, only used for logging
         * @return array of two strings: [0] the date as dd.MM.yyyy, [1] the time as HH:mm:ss;
         *         an entry is empty if the text is empty or could not be parsed
         */
        private String[] parseDateTime(String dateTimeValue, String strUrlFeed) {
            final String[] result = { "", "" };

            final String cleaned = dateTimeValue.replace("Uhr", "").trim();
            if (cleaned.isEmpty()) {
                return result;
            }

            try {
                final Date parsed = FastDateFormat.getInstance("dd.MM.yyyy HH:mm").parse(cleaned);
                result[0] = FastDateFormat.getInstance("dd.MM.yyyy").format(parsed);
                result[1] = FastDateFormat.getInstance("HH:mm:ss").format(parsed);
            } catch (Exception ex) {
                Log.errorLog(623657941, "convertDatum: " + strUrlFeed);
            }
            return result;
        }

        /**
         * Loads the page of one film and creates the film entry from it.
         *
         * <p>If the page contains both a player image id and a player id, the video URLs are read
         * from the player JSON by {@code filmSuchen_2()}. Otherwise the URL is taken from the
         * {@code itemprop="contentUrl"} attribute of the page itself.
         *
         * @param strUrlThema       URL of the overview page the film was found on (for messages)
         * @param thema             topic; if empty it is derived from the film page
         * @param titel             title of the film
         * @param filmWebsite       URL of the film page to load
         * @param datum             date of the film; if empty it is read from the film page
         * @param zeit              time of the film; replaced together with an empty date
         * @param durationInSeconds duration of the film in seconds
         * @param onlyUrl           passed on to {@code addFilm(film, onlyUrl)}
         */
        private void filmSuchen_1(String strUrlThema, String thema, String titel, String filmWebsite, String datum,
                String zeit, long durationInSeconds, boolean onlyUrl) {
            // the film page carries the URL like this:
            // itemprop="contentUrl" content="https://mediandr-a...hq.mp4"
            final String MUSTER_URL = "itemprop=\"contentUrl\" content=\"https://mediandr-a";

            seite2 = getUrl.getUri_Utf(SENDERNAME, filmWebsite, seite2, "strUrlThema: " + strUrlThema);
            final String description = extractDescription(seite2);
            // ,tracks: [{ src: "/fernsehen/sendungen/45_min/video-podcast/ut20448.xml", srclang:"de"}]
            String subtitle = seite2.extract(",tracks: [{ src: \"", "\"");
            if (!subtitle.isEmpty()) {
                subtitle = "http://www.ndr.de" + subtitle;
            }
            meldung(filmWebsite);

            try {
                if (datum.isEmpty()) {
                    final String published = seite2.extract("<span itemprop=\"datePublished\"", "</");
                    final int tagEnd = published.indexOf(">");
                    if (tagEnd != -1) {
                        final String[] dateValues = parseDateTime(published.substring(tagEnd + 1), strUrlThema);
                        datum = dateValues[0];
                        zeit = dateValues[1];
                    }
                }

                // src="/fernsehen/hallondsopplatt162-player_image-2c09ece0-0508-49bf-b4d6-afff2be2115c_theme-ndrde.html"
                // id="pp_hallondsopplatt162"
                // -> http://www.ndr.de/fernsehen/hallondsopplatt162-ppjson_image-2c09ece0-0508-49bf-b4d6-afff2be2115c.json
                final String imageId = seite2.extract("-player_image-", "_");
                final String playerId = seite2.extract("id=\"pp_", "\"");
                if (!imageId.isEmpty() && !playerId.isEmpty()) {
                    final String json = "http://www.ndr.de/fernsehen/" + playerId + "-ppjson_image-" + imageId
                            + ".json";
                    filmSuchen_2(strUrlThema, thema, titel, filmWebsite, json, datum, zeit, durationInSeconds,
                            description, subtitle);
                    return;
                }

                final int musterPos = seite2.indexOf(MUSTER_URL);
                if (musterPos == -1) {
                    Log.errorLog(915230214, "auch keine Url: " + filmWebsite);
                    return;
                }
                final int urlStart = musterPos + MUSTER_URL.length();
                final int urlEnd = seite2.indexOf("\"", urlStart);
                if (urlEnd == -1) {
                    return;
                }
                final String urlRest = seite2.substring(urlStart, urlEnd);
                if (urlRest.isEmpty()) {
                    Log.errorLog(623657941, "keine URL: " + filmWebsite);
                    return;
                }

                final String url = "http://mediandr-a" + urlRest;
                if (thema.isEmpty()) {
                    thema = themaAusFilmseite(seite2);
                }
                final DatenFilm film = new DatenFilm(SENDERNAME, thema, filmWebsite, titel, url, ""/*rtmpURL*/,
                        datum, zeit, durationInSeconds, description);
                CrawlerTool.addUrlSubtitle(film, subtitle);
                if (url.contains(".hq.")) {
                    // the ".hi." variant is registered as the small version of the ".hq." file
                    CrawlerTool.addUrlKlein(film, url.replace(".hq.", ".hi."), "");
                }
                addFilm(film, onlyUrl);
            } catch (Exception ex) {
                Log.errorLog(699830157, ex);
            }
        }

        /**
         * Derives a topic from the headline of a film page: everything from the last "|" and then
         * from the last "-" on is cut off; a result containing "Uhr" or an empty result becomes
         * "NDR".
         */
        private String themaAusFilmseite(MSStringBuilder page) {
            String headline = page.extract("<h1>", "<div class=\"subline\">", "<");
            if (headline.contains("|")) {
                headline = headline.substring(0, headline.lastIndexOf("|")).trim();
            }
            if (headline.contains("-")) {
                headline = headline.substring(0, headline.lastIndexOf("-")).trim();
            }
            if (headline.contains("Uhr")) {
                headline = "";
            }
            return headline.isEmpty() ? "NDR" : headline;
        }

        /**
         * Loads the player JSON of a film and creates the film entry from the video URLs in it.
         *
         * <p>URLs ending in ".hd.mp4", ".hq.mp4" and ".hi.mp4" are used as HD, normal and small
         * version. Without a ".hq.mp4" URL the ".hi.mp4" URL becomes the normal version. If there
         * is no normal version at all, only an error is logged.
         *
         * @param strUrlThema       URL of the overview page the film was found on (for messages)
         * @param thema             topic of the film
         * @param titel             title of the film
         * @param filmWebsite       URL of the film page
         * @param json              URL of the player JSON to load
         * @param datum             date of the film
         * @param zeit              time of the film
         * @param durationInSeconds duration of the film in seconds
         * @param description       description of the film
         * @param subtitle          subtitle URL; if empty it is looked up in the JSON
         */
        private void filmSuchen_2(String strUrlThema, String thema, String titel, String filmWebsite, String json,
                String datum, String zeit, long durationInSeconds, String description, String subtitle) {

            seite3 = getUrl.getUri_Utf(SENDERNAME, json, seite3, "strUrlThema: " + strUrlThema);
            seite3.extractList("", "", "\"src\": \"https://mediandr", "\"", "https://mediandr", liste);

            String urlHd = "";
            String urlNormal = "";
            String urlSmall = "";
            for (final String candidate : liste) {
                if (candidate.endsWith(".hd.mp4")) {
                    urlHd = candidate;
                } else if (candidate.endsWith(".hq.mp4")) {
                    urlNormal = candidate;
                } else if (candidate.endsWith(".hi.mp4")) {
                    urlSmall = candidate;
                }
            }
            liste.clear();

            if (urlNormal.isEmpty()) {
                // no ".hq." file: promote the ".hi." file to the normal version
                urlNormal = urlSmall;
                urlSmall = "";
            }

            // the films are stored with http instead of https URLs
            urlHd = urlHd.replaceFirst("https:", "http:");
            urlNormal = urlNormal.replaceFirst("https:", "http:");
            urlSmall = urlSmall.replaceFirst("https:", "http:");

            if (subtitle.isEmpty()) {
                subtitle = seite3.extract("\"tracks\":", "\"/", "\"", "http://www.ndr.de/");
            }

            if (urlNormal.isEmpty()) {
                Log.errorLog(915234210, "keine URL im json: " + filmWebsite);
                return;
            }

            final DatenFilm film = new DatenFilm(SENDERNAME, thema, filmWebsite, titel, urlNormal, ""/*rtmpURL*/,
                    datum, zeit, durationInSeconds, description);
            if (!subtitle.isEmpty()) {
                CrawlerTool.addUrlSubtitle(film, subtitle);
            }
            if (!urlHd.isEmpty()) {
                CrawlerTool.addUrlHd(film, urlHd, "");
            }
            if (!urlSmall.isEmpty()) {
                CrawlerTool.addUrlKlein(film, urlSmall, "");
            }
            addFilm(film);
        }

        /**
         * Reads the content of the {@code og:description} meta tag of a page.
         *
         * @param page page to search
         * @return the description, or an empty string if the tag was not found
         */
        private String extractDescription(MSStringBuilder page) {
            final String description = extractString(page, "<meta property=\"og:description\" content=\"", "\"");
            return description != null ? description : "";
        }

        /*        private String[] extractKeywords(MSStringBuilder page) {
        String keywords = extractString(page, "<meta name=\"keywords\"  lang=\"de\" content=\"", "\"");
        if (keywords == null) {
            return new String[]{""};
        }
        String[] k = keywords.split(",");
        for (int i = 0; i < k.length; i++) {
            k[i] = k[i].trim();
        }
        return k;
                }*/

        /**
         * Returns the text between the first occurrence of {@code startMarker} and the next
         * occurrence of {@code endMarker} after it.
         *
         * @param source      text to search
         * @param startMarker marker directly in front of the wanted text
         * @param endMarker   marker directly behind the wanted text
         * @return the text between the markers, or {@code null} if either marker is missing
         */
        private String extractString(MSStringBuilder source, String startMarker, String endMarker) {
            final int markerPos = source.indexOf(startMarker);
            if (markerPos != -1) {
                final int from = markerPos + startMarker.length();
                final int to = source.indexOf(endMarker, from);
                if (to != -1) {
                    return source.substring(from, to);
                }
            }
            return null;
        }
    }
}