Java HTML Jsoup Document getCategoryIds(final Document html)

Description

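Extracts the numeric category IDs from the `PageTemplate.set({breadcrumb: [...]})` call found in an inline script of the given HTML document.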

License

Open Source License

Declaration

static int[] getCategoryIds(final Document html)

Method Source Code

//package com.java2s;
//License from project: Open Source License 

import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Helpers for pulling category ids out of a page whose breadcrumb is rendered by JavaScript.
 */
public class Main {
    /** Substring used to pick the inline script that carries the breadcrumb data. */
    private static final String BREADCRUMB_MARKER = "PageTemplate.set({breadcrumb:";

    /**
     * Matches the literal breadcrumb call and captures the comma-separated id list (group 1).
     * Compiled once because the expression never changes between calls.
     */
    private static final Pattern BREADCRUMB_PATTERN = Pattern.compile(
            Pattern.quote("PageTemplate.set({breadcrumb: [") + "([0-9,-]+)" + Pattern.quote("]});"));

    /**
     * Reads the category ids from the breadcrumb call embedded in one of the page's
     * {@code <script>} elements.
     *
     * @param html the parsed page to inspect
     * @return the ids listed in the breadcrumb, in the order they appear
     * @throws java.util.NoSuchElementException if no script contains the breadcrumb marker, or
     *         the first such script does not contain the breadcrumb call in the expected form
     * @throws NumberFormatException if an entry of the captured list is not a valid {@code int}
     *         (for example an empty entry produced by two adjacent commas)
     */
    static int[] getCategoryIds(final Document html) {
        // Category taken from breadcrumb, which Wowhead draws with JS :(
        final String breadcrumbData = html.getElementsByTag("script").stream()
                .map(Element::data)
                .filter(data -> data.contains(BREADCRUMB_MARKER))
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException(
                        "No <script> element contains \"" + BREADCRUMB_MARKER + "\""));

        // ugh, parsing JS with regexes
        final Matcher matcher = BREADCRUMB_PATTERN.matcher(breadcrumbData);
        if (!matcher.find()) {
            throw new NoSuchElementException(
                    "Breadcrumb script does not match " + BREADCRUMB_PATTERN.pattern());
        }

        final String[] categoryIds = matcher.group(1).split(",");
        return Stream.of(categoryIds).mapToInt(Integer::parseInt).toArray();
    }

    /**
     * Finds the first match of {@code regex} in {@code str} and returns one of its capturing
     * groups.
     *
     * @param str   the text to search
     * @param regex the regular expression to search for
     * @param group index of the capturing group to return; {@code 0} is the whole match
     * @param flags optional {@link Pattern} match flags; all given flags are OR-ed together
     * @return the text captured by {@code group}, or an empty Optional when the expression
     *         does not match or the requested group did not take part in the match
     * @throws IndexOutOfBoundsException if the expression has no group with the given index
     */
    static Optional<String> getRegexGroup(final String str, final String regex, final int group,
            final int... flags) {
        int combinedFlags = 0;

        for (int flag : flags) {
            combinedFlags |= flag;
        }

        final Matcher matcher = Pattern.compile(regex, combinedFlags).matcher(str);

        if (!matcher.find()) {
            return Optional.empty();
        }

        // group() yields null for an optional group that did not participate in the match;
        // Optional.of would throw a NullPointerException in that case.
        return Optional.ofNullable(matcher.group(group));
    }
}

Java HTML Jsoup Document getCategoryIds(final Document html)

Description

License

Declaration

Method Source Code

Related