Java HTML Jsoup Document getInfoboxLines(final Document html, final boolean stripColor)

Here you can find the source of getInfoboxLines(final Document html, final boolean stripColor)

Description

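Extracts the infobox BBCode embedded in a page's script tag and returns it as a list of plain-text lines, optionally stripping color tags.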

License

Open Source License

Declaration

static List<String> getInfoboxLines(final Document html, final boolean stripColor) 

Method Source Code

//package com.java2s;
//License from project: Open Source License 

import java.util.Collections;

import java.util.List;

import java.util.Optional;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class Main {
    /** Locates the infobox {@code Markup.printHtml(...)} call; group 2 captures the quoted markup. */
    private static final String INFOBOX_MARKUP_REGEX = "[Mm]arkup\\.printHtml\\((['\"])(.*)\\1, 'infobox";

    /** A {@code \xNN} escape sequence with exactly two hexadecimal digits (either case). */
    private static final Pattern HEX_ESCAPE = Pattern.compile("\\\\x([0-9A-Fa-f]{2})");

    /** Literal boundary between two BBCode list items. */
    private static final Pattern LIST_ITEM_BREAK = Pattern.compile(Pattern.quote("[/li][li]"));

    /** Literal BBCode line break. */
    private static final Pattern LINE_BREAK = Pattern.compile(Pattern.quote("[br]"));

    /** {@code [race=X]}, {@code [class=X]} or {@code [money=X]} with a numeric X (group 1). */
    private static final Pattern VALUE_TAG = Pattern.compile("\\[(?:race|class|money)=([0-9]+)\\]");

    /** {@code [color=X]} opening tag; group 1 is the color name. */
    private static final Pattern COLOR_TAG = Pattern.compile("\\[color=([^\\]]+)\\]");

    /** Any remaining square-bracket tag. */
    private static final Pattern ANY_TAG = Pattern.compile("\\[[^\\]]+\\]");

    /**
     * Extracts the infobox BBCode from the first {@code <script>} element whose data contains
     * {@code "arkup.printHtml"} and converts it to a list of plain-text lines.
     *
     * <p>Lines are split on {@code [/li][li]} and {@code [br]}; {@code [race|class|money=N]} tags are
     * replaced by {@code N}; all other square-bracket tags are removed.
     *
     * @param html       the parsed page to search
     * @param stripColor if {@code true}, color tags are removed along with all other tags; if
     *                   {@code false}, each {@code [color=X]} opening tag is kept as {@code <X>}
     * @return the infobox lines, or an empty list if no matching script or no infobox markup is found
     */
    static List<String> getInfoboxLines(final Document html, final boolean stripColor) {
        final Optional<String> infoboxData = html.getElementsByTag("script").stream().map(Element::data)
                .filter(data -> data.contains("arkup.printHtml")).findFirst();

        if (!infoboxData.isPresent()) {
            return Collections.emptyList();
        }

        final Optional<String> rawMarkup = getRegexGroup(infoboxData.get(), INFOBOX_MARKUP_REGEX, 2);

        // The script mentions printHtml but carries no infobox call: treat it like "no infobox"
        // instead of failing on an empty Optional.
        if (!rawMarkup.isPresent()) {
            return Collections.emptyList();
        }

        // wowhead now escapes forward slashes
        final String infoboxMarkup = decodeHexEscapes(rawMarkup.get().replace("\\/", "/"));

        // We'll get BBCode, convert it to a list of plain text lines
        return Stream.of(LIST_ITEM_BREAK.split(infoboxMarkup))
                .flatMap((String line) -> Stream.of(LINE_BREAK.split(line)))
                // replace [race/class/money=X] with X
                .map(line -> VALUE_TAG.matcher(line).replaceAll("$1"))
                // When colors are kept, rewrite them to <X> so the catch-all tag removal below skips them;
                // when stripping, leave them bracketed so that removal deletes them.
                .map(line -> stripColor ? line : COLOR_TAG.matcher(line).replaceAll("<$1>"))
                .map(line -> ANY_TAG.matcher(line).replaceAll("")) // remove all square bracket tags
                .collect(Collectors.toList());
    }

    /**
     * Converts every {@code \xNN} escape sequence in {@code text} to the character with that code.
     *
     * @param text the text possibly containing {@code \xNN} sequences
     * @return the text with all such sequences decoded
     */
    private static String decodeHexEscapes(final String text) {
        final Matcher matcher = HEX_ESCAPE.matcher(text);
        // StringBuffer rather than StringBuilder: the StringBuilder overload of
        // appendReplacement does not exist on Java 8, which this file targets.
        final StringBuffer sb = new StringBuffer();

        while (matcher.find()) {
            final String decoded = Character.toString((char) Integer.parseInt(matcher.group(1), 16));
            // Quote the replacement: a decoded '\' or '$' would otherwise be interpreted as an
            // escape or group reference by appendReplacement.
            matcher.appendReplacement(sb, Matcher.quoteReplacement(decoded));
        }

        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Returns the given capturing group of the first match of {@code regex} in {@code str}.
     *
     * @param str   the input to search
     * @param regex the regular expression to compile
     * @param group the index of the capturing group to return
     * @param flags zero or more {@link Pattern} flags, OR-ed together before compiling
     * @return the group's text, or an empty {@code Optional} if the regex does not match or the group
     *         did not participate in the match
     */
    static Optional<String> getRegexGroup(final String str, final String regex, final int group,
            final int... flags) {
        int combinedFlags = 0;

        for (final int flag : flags) {
            combinedFlags |= flag;
        }

        final Matcher matcher = Pattern.compile(regex, combinedFlags).matcher(str);

        if (!matcher.find()) {
            return Optional.empty();
        }

        // group() yields null for an optional group that did not take part in the match.
        return Optional.ofNullable(matcher.group(group));
    }
}

Related

  1. getDocument(CloseableHttpClient client, String url)
  2. getDocument(final String url)
  3. getDocument(String url)
  4. getHtmlDocument(String url)
  5. getIcon(Document doc)
  6. getJSoupHtmlDocument(final String url)
  7. getJSoupXmlDocument(final String url)
  8. getLoginFields(Document doc)
  9. getTextFromAvailableDivID(Document doc, String divID)