Java tutorial
/* Copyright (C) 2003-2015 JabRef contributors. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ package net.sf.jabref.importer; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import net.sf.jabref.Globals; import net.sf.jabref.JabRefPreferences; import net.sf.jabref.exporter.layout.LayoutFormatter; import net.sf.jabref.logic.formatter.Formatter; public class HTMLConverter implements LayoutFormatter, Formatter { private static final Log LOGGER = LogFactory.getLog(HTMLConverter.class); private static final int MAX_TAG_LENGTH = 100; /* Portions International Organization for Standardization 1986: Permission to copy in any form is granted for use with conforming SGML systems and applications as defined in ISO 8879, provided this notice is included in all copies. */ // most of the LaTeX commands can be read at http://en.wikibooks.org/wiki/LaTeX/Accents // The symbols can be looked at http://www.fileformat.info/info/unicode/char/a4/index.htm. Replace "a4" with the U+ number // http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ might help to find the right LaTeX command // http://llg.cubic.org/docs/ent2latex.html and http://www.w3.org/TR/xml-entity-names/byalpha.html are also useful // as well as http://www.w3.org/Math/characters/unicode.xml // An array of arrays of strings in the format: // {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"} // Leaving a field empty is OK as it then will not be included private final String[][] conversionList = new String[][] { { "160", "nbsp", "\\{~\\}" }, // no-break space = non-breaking space, // U+00A0 ISOnum { "161", "iexcl", "\\{\\\\textexclamdown\\}" }, // inverted exclamation mark, U+00A1 ISOnum { "162", "cent", "\\{\\\\textcent\\}" }, // cent sign, U+00A2 ISOnum { "163", "pound", "\\{\\\\pounds\\}" }, // pound sign, U+00A3 ISOnum { "164", "curren", "\\{\\\\textcurrency\\}" }, // currency sign, U+00A4 ISOnum { "165", "yen", "\\{\\\\textyen\\}" }, // yen sign = yuan sign, U+00A5 ISOnum { "166", "brvbar", "\\{\\\\textbrokenbar\\}" }, // broken bar = broken vertical bar, // U+00A6 ISOnum { "167", "sect", "\\{\\\\S\\}" }, // section sign, U+00A7 ISOnum { "168", "uml", "\\{\\\\\"\\{\\}\\}" }, // diaeresis = spacing diaeresis, // U+00A8 ISOdia { "169", "copy", "\\{\\\\copyright\\}" }, // copyright sign, U+00A9 ISOnum { "170", "ordf", "\\{\\\\textordfeminine\\}" }, // feminine ordinal indicator, U+00AA ISOnum { "171", "laquo", "\\{\\\\guillemotleft\\}" }, // left-pointing double angle quotation mark // = left pointing guillemet, U+00AB ISOnum { "172", "not", "\\$\\\\neg\\$" }, // not sign, U+00AC ISOnum { "173", "shy", "\\\\-" }, // soft hyphen = discretionary hyphen, // U+00AD ISOnum { "174", "reg", "\\{\\\\textregistered\\}" }, // registered sign = registered trade mark sign, // U+00AE ISOnum { "175", "macr", "\\{\\\\=\\{\\}\\}" }, // macron = spacing macron = overline // = APL overbar, U+00AF ISOdia { "176", "deg", "\\$\\\\deg\\$" }, // degree sign, U+00B0 ISOnum { "177", "plusmn", "\\$\\\\pm\\$" }, // plus-minus sign = plus-or-minus sign, // U+00B1 ISOnum { "178", "sup2", "\\\\textsuperscript\\{2\\}" }, // superscript two = superscript digit two // = squared, U+00B2 ISOnum { "179", "sup3", "\\\\textsuperscript\\{3\\}" }, // superscript three = superscript digit three // = cubed, U+00B3 ISOnum { "180", "acute", "\\{\\\\'\\{\\}\\}" }, // acute accent = spacing acute, // U+00B4 ISOdia { "181", "micro", "\\$\\\\mu\\$" }, // micro sign, U+00B5 ISOnum { "", "mu", "\\$\\\\mu\\$" }, // micro sign, U+00B5 ISOnum { "182", "para", "\\{\\\\P\\}" }, // pilcrow sign = paragraph sign, // U+00B6 ISOnum { "183", "middot", "\\$\\\\cdot\\$" }, // middle dot = Georgian comma // = Greek middle dot, U+00B7 ISOnum { "184", "cedil", "\\{\\\\c\\{\\}\\}" }, // cedilla = spacing cedilla, U+00B8 ISOdia { "185", "sup1", "\\\\textsuperscript\\{1\\}" }, // superscript one = superscript digit one, // U+00B9 ISOnum { "186", "ordm", "\\{\\\\textordmasculine\\}" }, // masculine ordinal indicator, // U+00BA ISOnum { "187", "raquo", "\\{\\\\guillemotright\\}" }, // right-pointing double angle quotation mark // = right pointing guillemet, U+00BB ISOnum { "188", "frac14", "\\$\\\\sfrac\\{1\\}\\{4\\}\\$" }, // vulgar fraction one quarter // = fraction one quarter, U+00BC ISOnum { "189", "frac12", "\\$\\\\sfrac\\{1\\}\\{2\\}\\$" }, // vulgar fraction one half // = fraction one half, U+00BD ISOnum { "190", "frac34", "\\$\\\\sfrac\\{3\\}\\{4\\}\\$" }, // vulgar fraction three quarters // = fraction three quarters, U+00BE ISOnum { "191", "iquest", "\\{\\\\textquestiondown\\}" }, // inverted question mark // = turned question mark, U+00BF ISOnum { "192", "Agrave", "\\{\\\\`\\{A\\}\\}" }, // latin capital letter A with grave // = latin capital letter A grave, // U+00C0 ISOlat1 { "193", "Aacute", "\\{\\\\'\\{A\\}\\}" }, // latin capital letter A with acute, // U+00C1 ISOlat1 { "194", "Acirc", "\\{\\\\\\^\\{A\\}\\}" }, // latin capital letter A with circumflex, // U+00C2 ISOlat1 { "195", "Atilde", "\\{\\\\~\\{A\\}\\}" }, // latin capital letter A with tilde, // U+00C3 ISOlat1 { "196", "Auml", "\\{\\\\\"\\{A\\}\\}" }, // latin capital letter A with diaeresis, // U+00C4 ISOlat1 { "197", "Aring", "\\{\\{\\\\AA\\}\\}" }, // latin capital letter A with ring above // = latin capital letter A ring, // U+00C5 ISOlat1 { "198", "AElig", "\\{\\\\AE\\}" }, // latin capital letter AE // = latin capital ligature AE, // U+00C6 ISOlat1 { "199", "Ccedil", "\\{\\\\c\\{C\\}\\}" }, // latin capital letter C with cedilla, // U+00C7 ISOlat1 { "200", "Egrave", "\\{\\\\`\\{E\\}\\}" }, // latin capital letter E with grave, // U+00C8 ISOlat1 { "201", "Eacute", "\\{\\\\'\\{E\\}\\}" }, // latin capital letter E with acute, // U+00C9 ISOlat1 { "202", "Ecirc", "\\{\\\\\\^\\{E\\}\\}" }, // latin capital letter E with circumflex, // U+00CA ISOlat1 { "203", "Euml", "\\{\\\\\"\\{E\\}\\}" }, // latin capital letter E with diaeresis, // U+00CB ISOlat1 { "204", "Igrave", "\\{\\\\`\\{I\\}\\}" }, // latin capital letter I with grave, // U+00CC ISOlat1 { "205", "Iacute", "\\{\\\\'\\{I\\}\\}" }, // latin capital letter I with acute, // U+00CD ISOlat1 { "206", "Icirc", "\\{\\\\\\^\\{I\\}\\}" }, // latin capital letter I with circumflex, // U+00CE ISOlat1 { "207", "Iuml", "\\{\\\\\"\\{I\\}\\}" }, // latin capital letter I with diaeresis, // U+00CF ISOlat1 { "208", "ETH", "\\{\\\\DH\\}" }, // latin capital letter ETH, U+00D0 ISOlat1 { "209", "Ntilde", "\\{\\\\~\\{N\\}\\}" }, // latin capital letter N with tilde, // U+00D1 ISOlat1 { "210", "Ograve", "\\{\\\\`\\{O\\}\\}" }, // latin capital letter O with grave, // U+00D2 ISOlat1 { "211", "Oacute", "\\{\\\\'\\{O\\}\\}" }, // latin capital letter O with acute, // U+00D3 ISOlat1 { "212", "Ocirc", "\\{\\\\\\^\\{O\\}\\}" }, // latin capital letter O with circumflex, // U+00D4 ISOlat1 { "213", "Otilde", "\\{\\\\~\\{O\\}\\}" }, // latin capital letter O with tilde, // U+00D5 ISOlat1 { "214", "Ouml", "\\{\\\\\"\\{O\\}\\}" }, // latin capital letter O with diaeresis, // U+00D6 ISOlat1 { "215", "times", "\\$\\\\times\\$" }, // multiplication sign, U+00D7 ISOnum { "216", "Oslash", "\\{\\\\O\\}" }, // latin capital letter O with stroke // = latin capital letter O slash, // U+00D8 ISOlat1 { "217", "Ugrave", "\\{\\\\`\\{U\\}\\}" }, // latin capital letter U with grave, // U+00D9 ISOlat1 { "218", "Uacute", "\\{\\\\'\\{U\\}\\}" }, // latin capital letter U with acute, // U+00DA ISOlat1 { "219", "Ucirc", "\\{\\\\\\^\\{U\\}\\}" }, // latin capital letter U with circumflex, // U+00DB ISOlat1 { "220", "Uuml", "\\{\\\\\"\\{U\\}\\}" }, // latin capital letter U with diaeresis, // U+00DC ISOlat1 { "221", "Yacute", "\\{\\\\'\\{Y\\}\\}" }, // latin capital letter Y with acute, // U+00DD ISOlat1 { "222", "THORN", "\\{\\\\TH\\}" }, // latin capital letter THORN, // U+00DE ISOlat1 { "223", "szlig", "\\{\\\\ss\\}" }, // latin small letter sharp s = ess-zed, // U+00DF ISOlat1 { "224", "agrave", "\\{\\\\`\\{a\\}\\}" }, // latin small letter a with grave // = latin small letter a grave, // U+00E0 ISOlat1 { "225", "aacute", "\\{\\\\'\\{a\\}\\}" }, // latin small letter a with acute, // U+00E1 ISOlat1 { "226", "acirc", "\\{\\\\\\^\\{a\\}\\}" }, // latin small letter a with circumflex, // U+00E2 ISOlat1 { "227", "atilde", "\\{\\\\~\\{a\\}\\}" }, // latin small letter a with tilde, // U+00E3 ISOlat1 { "228", "auml", "\\{\\\\\"\\{a\\}\\}" }, // latin small letter a with diaeresis, // U+00E4 ISOlat1 { "229", "aring", "\\{\\{\\\\aa\\}\\}" }, // latin small letter a with ring above // = latin small letter a ring, // U+00E5 ISOlat1 { "230", "aelig", "\\{\\\\ae\\}" }, // latin small letter ae // = latin small ligature ae, U+00E6 ISOlat1 { "231", "ccedil", "\\{\\\\c\\{c\\}\\}" }, // latin small letter c with cedilla, // U+00E7 ISOlat1 { "232", "egrave", "\\{\\\\`\\{e\\}\\}" }, // latin small letter e with grave, // U+00E8 ISOlat1 { "233", "eacute", "\\{\\\\'\\{e\\}\\}" }, // latin small letter e with acute, // U+00E9 ISOlat1 { "234", "ecirc", "\\{\\\\\\^\\{e\\}\\}" }, // latin small letter e with circumflex, // U+00EA ISOlat1 { "235", "euml", "\\{\\\\\"\\{e\\}\\}" }, // latin small letter e with diaeresis, // U+00EB ISOlat1 { "236", "igrave", "\\{\\\\`\\{i\\}\\}" }, // latin small letter i with grave, // U+00EC ISOlat1 { "237", "iacute", "\\{\\\\'\\{i\\}\\}" }, // latin small letter i with acute, // U+00ED ISOlat1 { "238", "icirc", "\\{\\\\\\^\\{i\\}\\}" }, // latin small letter i with circumflex, // U+00EE ISOlat1 { "239", "iuml", "\\{\\\\\"\\{i\\}\\}" }, // latin small letter i with diaeresis, // U+00EF ISOlat1 { "240", "eth", "\\{\\\\dh\\}" }, // latin small letter eth, U+00F0 ISOlat1 { "241", "ntilde", "\\{\\\\~\\{n\\}\\}" }, // latin small letter n with tilde, // U+00F1 ISOlat1 { "242", "ograve", "\\{\\\\`\\{o\\}\\}" }, // latin small letter o with grave, // U+00F2 ISOlat1 { "243", "oacute", "\\{\\\\'\\{o\\}\\}" }, // latin small letter o with acute, // U+00F3 ISOlat1 { "244", "ocirc", "\\{\\\\\\^\\{o\\}\\}" }, // latin small letter o with circumflex, // U+00F4 ISOlat1 { "245", "otilde", "\\{\\\\~\\{o\\}\\}" }, // latin small letter o with tilde, // U+00F5 ISOlat1 { "246", "ouml", "\\{\\\\\"\\{o\\}\\}" }, // latin small letter o with diaeresis, // U+00F6 ISOlat1 { "247", "divide", "\\$\\\\div\\$" }, // division sign, U+00F7 ISOnum { "248", "oslash", "\\{\\\\o\\}" }, // latin small letter o with stroke, // = latin small letter o slash, // U+00F8 ISOlat1 { "249", "ugrave", "\\{\\\\`\\{u\\}\\}" }, // latin small letter u with grave, // U+00F9 ISOlat1 { "250", "uacute", "\\{\\\\'\\{u\\}\\}" }, // latin small letter u with acute, // U+00FA ISOlat1 { "251", "ucirc", "\\{\\\\\\^\\{u\\}\\}" }, // latin small letter u with circumflex, // U+00FB ISOlat1 { "252", "uuml", "\\{\\\\\"\\{u\\}\\}" }, // latin small letter u with diaeresis, // U+00FC ISOlat1 { "253", "yacute", "\\{\\\\'\\{y\\}\\}" }, // latin small letter y with acute, // U+00FD ISOlat1 { "254", "thorn", "\\{\\\\th\\}" }, // latin small letter thorn, // U+00FE ISOlat1 { "255", "yuml", "\\{\\\\\"\\{y\\}\\}" }, // latin small letter y with diaeresis, // U+00FF ISOlat1 { "332", "Omacro", "\\{\\\\=\\{O\\}\\}" }, // the small letter o with macron { "333", "omacro", "\\{\\\\=\\{o\\}\\}" }, // the big letter O with macron { "402", "fnof", "\\$f\\$" }, // latin small f with hook = function // = florin, U+0192 ISOtech /* Greek */ { "913", "Alpha", "\\{\\$\\\\Alpha\\$\\}" }, // greek capital letter alpha, U+0391 { "914", "Beta", "\\{\\$\\\\Beta\\$\\}" }, // greek capital letter beta, U+0392 { "915", "Gamma", "\\{\\$\\\\Gamma\\$\\}" }, // greek capital letter gamma, // U+0393 ISOgrk3 { "916", "Delta", "\\{\\$\\\\Delta\\$\\}" }, // greek capital letter delta, // U+0394 ISOgrk3 { "917", "Epsilon", "\\{\\$\\\\Epsilon\\$\\}" }, // greek capital letter epsilon, U+0395 { "918", "Zeta", "\\{\\$\\\\Zeta\\$\\}" }, // greek capital letter zeta, U+0396 { "919", "Eta", "\\{\\$\\\\Eta\\$\\}" }, // greek capital letter eta, U+0397 { "920", "Theta", "\\{\\$\\\\Theta\\$\\}" }, // greek capital letter theta, // U+0398 ISOgrk3 { "921", "Iota", "\\{\\$\\\\Iota\\$\\}" }, // greek capital letter iota, U+0399 { "922", "Kappa", "\\{\\$\\\\Kappa\\$\\}" }, // greek capital letter kappa, U+039A { "923", "Lambda", "\\{\\$\\\\Lambda\\$\\}" }, // greek capital letter lambda, // U+039B ISOgrk3 { "924", "Mu", "\\{\\$\\\\Mu\\$\\}" }, // greek capital letter mu, U+039C { "925", "Nu", "\\{\\$\\\\Nu\\$\\}" }, // greek capital letter nu, U+039D { "926", "Xi", "\\{\\$\\\\Xi\\$\\}" }, // greek capital letter xi, U+039E ISOgrk3 { "927", "Omicron", "\\{\\$\\\\Omicron\\$\\}" }, // greek capital letter omicron, U+039F { "928", "Pi", "\\{\\$\\\\Pi\\$\\}" }, // greek capital letter pi, U+03A0 ISOgrk3 { "929", "Rho", "\\{\\$\\\\Rho\\$\\}" }, // greek capital letter rho, U+03A1 /* there is no Sigmaf, and no U+03A2 character either */ { "931", "Sigma", "\\{\\$\\\\Sigma\\$\\}" }, // greek capital letter sigma, // U+03A3 ISOgrk3 { "932", "Tau", "\\{\\$\\\\Tau\\$\\}" }, // greek capital letter tau, U+03A4 { "933", "Upsilon", "\\{\\$\\\\Upsilon\\$\\}" }, // greek capital letter upsilon, // U+03A5 ISOgrk3 { "934", "Phi", "\\{\\$\\\\Phi\\$\\}" }, // greek capital letter phi, // U+03A6 ISOgrk3 { "935", "Chi", "\\{\\$\\\\Chi\\$\\}" }, // greek capital letter chi, U+03A7 { "936", "Psi", "\\{\\$\\\\Psi\\$\\}" }, // greek capital letter psi, // U+03A8 ISOgrk3 { "937", "Omega", "\\{\\$\\\\Omega\\$\\}" }, // greek capital letter omega, // U+03A9 ISOgrk3 { "945", "alpha", "\\$\\\\alpha\\$" }, // greek small letter alpha, // U+03B1 ISOgrk3 { "946", "beta", "\\$\\\\beta\\$" }, // greek small letter beta, U+03B2 ISOgrk3 { "947", "gamma", "\\$\\\\gamma\\$" }, // greek small letter gamma, // U+03B3 ISOgrk3 { "948", "delta", "\\$\\\\delta\\$" }, // greek small letter delta, // U+03B4 ISOgrk3 { "949", "epsilon", "\\$\\\\epsilon\\$" }, // greek small letter epsilon, // U+03B5 ISOgrk3 { "950", "zeta", "\\$\\\\zeta\\$" }, // greek small letter zeta, U+03B6 ISOgrk3 { "951", "eta", "\\$\\\\eta\\$" }, // greek small letter eta, U+03B7 ISOgrk3 { "952", "theta", "\\$\\\\theta\\$" }, // greek small letter theta, // U+03B8 ISOgrk3 { "953", "iota", "\\$\\\\iota\\$" }, // greek small letter iota, U+03B9 ISOgrk3 { "954", "kappa", "\\$\\\\kappa\\$" }, // greek small letter kappa, // U+03BA ISOgrk3 { "955", "lambda", "\\$\\\\lambda\\$" }, // greek small letter lambda, // U+03BB ISOgrk3 { "956", "mu", "\\$\\\\mu\\$" }, // greek small letter mu, U+03BC ISOgrk3 { "957", "nu", "\\$\\\\nu\\$" }, // greek small letter nu, U+03BD ISOgrk3 { "958", "xi", "\\$\\\\xi\\$" }, // greek small letter xi, U+03BE ISOgrk3 { "959", "omicron", "\\$\\\\omicron\\$" }, // greek small letter omicron, U+03BF NEW { "960", "pi", "\\$\\\\phi\\$" }, // greek small letter pi, U+03C0 ISOgrk3 { "961", "rho", "\\$\\\\rho\\$" }, // greek small letter rho, U+03C1 ISOgrk3 { "962", "sigmaf", "\\$\\\\varsigma\\$" }, // greek small letter final sigma, // U+03C2 ISOgrk3 { "963", "sigma", "\\$\\\\sigma\\$" }, // greek small letter sigma, // U+03C3 ISOgrk3 { "964", "tau", "\\$\\\\tau\\$" }, // greek small letter tau, U+03C4 ISOgrk3 { "965", "upsilon", "\\$\\\\upsilon\\$" }, // greek small letter upsilon, { "", "upsi", "\\$\\\\upsilon\\$" }, // alias // U+03C5 ISOgrk3 { "966", "phi", "\\$\\\\phi\\$" }, // greek small letter phi, U+03C6 ISOgrk3 { "967", "chi", "\\$\\\\chi\\$" }, // greek small letter chi, U+03C7 ISOgrk3 { "968", "psi", "\\$\\\\psi\\$" }, // greek small letter psi, U+03C8 ISOgrk3 { "969", "omega", "\\$\\\\omega\\$" }, // greek small letter omega, // U+03C9 ISOgrk3 { "977", "thetasym", "\\$\\\\vartheta\\$" }, // greek small letter theta symbol, { "", "thetav", "\\$\\\\vartheta\\$" }, // greek small letter theta symbol, { "", "vartheta", "\\$\\\\vartheta\\$" }, // greek small letter theta symbol, // U+03D1 NEW { "978", "upsih", "\\{\\$\\\\Upsilon\\$\\}" }, // greek upsilon with hook symbol, // U+03D2 NEW { "982", "piv", "\\$\\\\varphi\\$" }, // greek pi symbol, U+03D6 ISOgrk3 /* General Punctuation */ { "8226", "bull", "\\$\\\\bullet\\$" }, // bullet = black small circle, // U+2022 ISOpub /* bullet is NOT the same as bullet operator, U+2219 */ { "8230", "hellip", "\\{\\\\ldots\\}" }, // horizontal ellipsis = three dot leader, // U+2026 ISOpub { "8242", "prime", "\\$\\\\prime\\$" }, // prime = minutes = feet, U+2032 ISOtech { "8243", "Prime", "\\$\\{''\\}\\$" }, // double prime = seconds = inches, // U+2033 ISOtech { "8254", "oline", "\\{\\\\=\\{\\}\\}" }, // overline = spacing overscore, // U+203E NEW { "8260", "frasl", "/" }, // fraction slash, U+2044 NEW /* Letterlike Symbols */ { "8472", "weierp", "\\$\\\\wp\\$" }, // script capital P = power set // = Weierstrass p, U+2118 ISOamso { "8465", "image", "\\{\\$\\\\Im\\$\\}" }, // blackletter capital I = imaginary part, // U+2111 ISOamso { "8476", "real", "\\{\\$\\\\Re\\$\\}" }, // blackletter capital R = real part symbol, // U+211C ISOamso { "8482", "trade", "\\{\\\\texttrademark\\}" }, // trade mark sign, U+2122 ISOnum { "8501", "alefsym", "\\$\\\\aleph\\$" }, // alef symbol = first transfinite cardinal, // U+2135 NEW /* alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters */ /* Arrows */ { "8592", "larr", "\\$\\\\leftarrow\\$" }, // leftwards arrow, U+2190 ISOnum { "8593", "uarr", "\\$\\\\uparrow\\$" }, // upwards arrow, U+2191 ISOnum { "8594", "rarr", "\\$\\\\rightarrow\\$" }, // rightwards arrow, U+2192 ISOnum { "8595", "darr", "\\$\\\\downarrow\\$" }, // downwards arrow, U+2193 ISOnum { "8596", "harr", "\\$\\\\leftrightarrow\\$" }, // left right arrow, U+2194 ISOamsa { "8629", "crarr", "\\$\\\\dlsh\\$" }, // downwards arrow with corner leftwards // = carriage return, U+21B5 NEW - require mathabx { "8656", "lArr", "\\{\\$\\\\Leftarrow\\$\\}" }, // leftwards double arrow, U+21D0 ISOtech /* ISO 10646 does not say that lArr is the same as the 'is implied by' arrow but also does not have any other character for that function. So ? lArr can be used for 'is implied by' as ISOtech suggests */ { "8657", "uArr", "\\{\\$\\\\Uparrow\\$\\}" }, // upwards double arrow, U+21D1 ISOamsa { "8658", "rArr", "\\{\\$\\\\Rightarrow\\$\\}" }, // rightwards double arrow, // U+21D2 ISOtech /* ISO 10646 does not say this is the 'implies' character but does not have another character with this function so ? rArr can be used for 'implies' as ISOtech suggests */ { "8659", "dArr", "\\{\\$\\\\Downarrow\\$\\}" }, // downwards double arrow, U+21D3 ISOamsa { "8660", "hArr", "\\{\\$\\\\Leftrightarrow\\$\\}" }, // left right double arrow, // U+21D4 ISOamsa /* Mathematical Operators */ { "8704", "forall", "\\$\\\\forall\\$" }, // for all, U+2200 ISOtech { "8706", "part", "\\$\\\\partial\\$" }, // partial differential, U+2202 ISOtech { "8707", "exist", "\\$\\\\exists\\$" }, // there exists, U+2203 ISOtech { "8709", "empty", "\\$\\\\emptyset\\$" }, // empty set = null set = diameter, // U+2205 ISOamso { "8711", "nabla", "\\$\\\\nabla\\$" }, // nabla = backward difference, // U+2207 ISOtech { "8712", "isin", "\\$\\\\in\\$" }, // element of, U+2208 ISOtech { "8713", "notin", "\\$\\\\notin\\$" }, // not an element of, U+2209 ISOtech { "8715", "ni", "\\$\\\\ni\\$" }, // contains as member, U+220B ISOtech /* should there be a more memorable name than 'ni'? */ { "8719", "prod", "\\$\\\\prod\\$" }, // n-ary product = product sign, // U+220F ISOamsb /* prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both */ { "8721", "sum", "\\$\\\\sum\\$" }, // n-ary sumation, U+2211 ISOamsb /* sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both */ { "8722", "minus", "\\$-\\$" }, // minus sign, U+2212 ISOtech { "8727", "lowast", "\\$\\\\ast\\$" }, // asterisk operator, U+2217 ISOtech { "8730", "radic", "\\$\\\\sqrt{}\\$" }, // square root = radical sign, // U+221A ISOtech { "8733", "prop", "\\$\\\\propto\\$" }, // proportional to, U+221D ISOtech { "8734", "infin", "\\$\\\\infty\\$" }, // infinity, U+221E ISOtech { "8736", "ang", "\\$\\\\angle\\$" }, // angle, U+2220 ISOamso { "8743", "and", "\\$\\\\land\\$" }, // logical and = wedge, U+2227 ISOtech { "8744", "or", "\\$\\\\lor\\$" }, // logical or = vee, U+2228 ISOtech { "8745", "cap", "\\$\\\\cap\\$" }, // intersection = cap, U+2229 ISOtech { "8746", "cup", "\\$\\\\cup\\$" }, // union = cup, U+222A ISOtech { "8747", "int", "\\$\\\\int\\$" }, // integral, U+222B ISOtech { "8756", "there4", "\\$\\\\uptherefore\\$" }, // therefore, U+2234 ISOtech; only in LaTeX package MnSymbol { "8764", "sim", "\\$\\\\sim\\$" }, // tilde operator = varies with = similar to, // U+223C ISOtech /* tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both */ { "8773", "cong", "\\$\\\\cong\\$" }, // approximately equal to, U+2245 ISOtech { "8776", "asymp", "\\$\\\\approx\\$" }, // almost equal to = asymptotic to, // U+2248 ISOamsr { "8800", "ne", "\\$\\\\neq\\$" }, // not equal to, U+2260 ISOtech { "8801", "equiv", "\\$\\\\equiv\\$" }, // identical to, U+2261 ISOtech { "8804", "le", "\\$\\\\leq\\$" }, // less-than or equal to, U+2264 ISOtech { "8805", "ge", "\\$\\\\geq\\$" }, // greater-than or equal to, // U+2265 ISOtech { "8834", "sub", "\\$\\\\subset\\$" }, // subset of, U+2282 ISOtech { "8835", "sup", "\\$\\\\supset\\$" }, // superset of, U+2283 ISOtech /* note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn */ { "8836", "nsub", "\\$\\\\nsubset\\$" }, // not a subset of, U+2284 ISOamsn { "8838", "sube", "\\$\\\\subseteq\\$" }, // subset of or equal to, U+2286 ISOtech { "8839", "supe", "\\$\\\\supseteq\\$" }, // superset of or equal to, // U+2287 ISOtech { "8853", "oplus", "\\$\\\\oplus\\$" }, // circled plus = direct sum, // U+2295 ISOamsb { "8855", "otimes", "\\$\\\\otimes\\$" }, // circled times = vector product, // U+2297 ISOamsb { "8869", "perp", "\\$\\\\perp\\$" }, // up tack = orthogonal to = perpendicular, // U+22A5 ISOtech { "8901", "sdot", "\\$\\\\cdot\\$" }, // dot operator, U+22C5 ISOamsb /* dot operator is NOT the same character as U+00B7 middle dot */ { "8968", "lceil", "\\$\\\\lceil\\$" }, // left ceiling = apl upstile, // U+2308 ISOamsc { "8969", "rceil", "\\$\\\\rceil\\$" }, // right ceiling, U+2309 ISOamsc { "8970", "lfloor", "\\$\\\\lfloor\\$" }, // left floor = apl downstile, // U+230A ISOamsc { "8971", "rfloor", "\\$\\\\rfloor\\$" }, // right floor, U+230B ISOamsc /* Miscellaneous Technical */ { "9001", "lang", "\\$\\\\langle\\$" }, // left-pointing angle bracket = bra, // U+2329 ISOtech /* lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark' */ { "9002", "rang", "\\$\\\\rangle\\$" }, // right-pointing angle bracket = ket, // U+232A ISOtech /* rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark' */ /* Geometric Shapes */ { "9674", "loz", "\\$\\\\lozenge\\$" }, // lozenge, U+25CA ISOpub /* Miscellaneous Symbols */ { "9824", "spades", "\\$\\\\spadesuit\\$" }, // black spade suit, U+2660 ISOpub /* black here seems to mean filled as opposed to hollow */ { "9827", "clubs", "\\$\\\\clubsuit\\$" }, // black club suit = shamrock, // U+2663 ISOpub { "9829", "hearts", "\\$\\\\heartsuit\\$" }, // black heart suit = valentine, // U+2665 ISOpub { "9830", "diams", "\\$\\\\diamondsuit\\$" }, // black diamond suit, U+2666 ISOpub { "34", "quot", "\"" }, // quotation mark = APL quote, // U+0022 ISOnum { "38", "amp", "\\\\&" }, // ampersand, U+0026 ISOnum { "60", "lt", "\\$<\\$" }, // less-than sign, U+003C ISOnum { "62", "gt", "\\$>\\$" }, // greater-than sign, U+003E ISOnum /* Latin Extended-A */ { "338", "OElig", "\\{\\\\OE\\}" }, // latin capital ligature OE, // U+0152 ISOlat2 { "339", "oelig", "\\{\\\\oe\\}" }, // latin small ligature oe, U+0153 ISOlat2 /* ligature is a misnomer, this is a separate character in some languages */ { "352", "Scaron", "\\{\\\\v\\{S\\}\\}" }, // latin capital letter S with caron, // U+0160 ISOlat2 { "353", "scaron", "\\{\\\\v\\{s\\}\\}" }, // latin small letter s with caron, // U+0161 ISOlat2 { "376", "Yuml", "\\{\\\\\"\\{Y\\}\\}" }, // latin capital letter Y with diaeresis, // U+0178 ISOlat2 /* Spacing Modifier Letters */ { "710", "circ", "\\{\\\\textasciicircum\\}" }, // modifier letter circumflex accent, // U+02C6 ISOpub { "732", "tilde", "\\{\\\\textasciitilde\\}" }, // small tilde, U+02DC ISOdia /* General Punctuation */ { "8194", "ensp", "\\\\hspace\\{0.5em\\}" }, // en space, U+2002 ISOpub { "8195", "emsp", "\\\\hspace\\{1em\\}" }, // em space, U+2003 ISOpub { "8201", "thinsp", "\\\\hspace\\{0.167em\\}" }, // thin space, U+2009 ISOpub { "8204", "zwnj", "" }, // zero width non-joiner, // U+200C NEW RFC 2070 { "8205", "zwj", "" }, // zero width joiner, U+200D NEW RFC 2070 { "8206", "lrm", "" }, // left-to-right mark, U+200E NEW RFC 2070 { "8207", "rlm", "" }, // right-to-left mark, U+200F NEW RFC 2070 { "8211", "ndash", "--" }, // en dash, U+2013 ISOpub { "8212", "mdash", "---" }, // em dash, U+2014 ISOpub { "8216", "lsquo", "\\{\\\\textquoteleft\\}" }, // left single quotation mark, // U+2018 ISOnum { "8217", "rsquo", "\\{\\\\textquoteright\\}" }, // right single quotation mark, // U+2019 ISOnum { "8218", "sbquo", "\\{\\\\quotesinglbase\\}" }, // single low-9 quotation mark, U+201A NEW { "8220", "ldquo", "\\{\\\\textquotedblleft\\}" }, // left double quotation mark, // U+201C ISOnum { "8221", "rdquo", "\\{\\\\textquotedblright\\}" }, // right double quotation mark, // U+201D ISOnum { "8222", "bdquo", "\\{\\\\quotedblbase\\}" }, // double low-9 quotation mark, U+201E NEW { "8224", "dagger", "\\{\\\\dag\\}" }, // dagger, U+2020 ISOpub { "8225", "Dagger", "\\{\\\\ddag\\}" }, // double dagger, U+2021 ISOpub { "8240", "permil", "\\{\\\\textperthousand\\}" }, // per mille sign, U+2030 ISOtech { "8249", "lsaquo", "\\{\\\\guilsinglleft\\}" }, // single left-pointing angle quotation mark, // U+2039 ISO proposed /* lsaquo is proposed but not yet ISO standardized */ { "8250", "rsaquo", "\\{\\\\guilsinglright\\}" }, // single right-pointing angle quotation mark, // U+203A ISO proposed /* rsaquo is proposed but not yet ISO standardized */ { "8364", "euro", "\\{\\\\texteuro\\}" }, // euro sign, U+20AC NEW /* Manually added */ { "35", "", "\\\\#" }, // Hash { "36", "dollar", "\\\\$" }, // Dollar { "37", "percnt", "\\\\%" }, // Percent { "39", "apos", "'" }, // Apostrophe { "40", "lpar", "(" }, // Left bracket { "41", "rpar", ")" }, // Right bracket { "43", "plus", "\\+" }, // Plus { "44", "comma", "," }, // Comma { "45", "hyphen", "-" }, // Hyphen { "46", "period", "\\." }, // Period { "47", "slash", "/" }, // Slash (solidus) { "58", "colon", ":" }, // Colon { "59", "semi", ";" }, // Semi colon { "61", "equals", "=" }, // Equals to { "91", "lsqb", "\\[" }, // Left square bracket { "92", "bsol", "\\{\\\\textbackslash\\}" }, // Backslash { "93", "rsqb", "\\]" }, // Right square bracket { "94", "Hat", "\\{\\\\\\^\\{\\}\\}" }, // Circumflex { "95", "lowbar", "\\\\_" }, // Underscore { "96", "grave", "\\{\\\\`\\{\\}\\}" }, // Grave { "123", "lbrace", "\\\\\\{" }, // Left curly bracket { "", "lcub", "\\\\\\{" }, // Left curly bracket { "124", "vert", "\\|" }, // Vertical bar { "", "verbar", "\\|" }, // Vertical bar { "", "VerticalLine", "\\|" }, // Vertical bar { "125", "rbrace", "\\\\\\}" }, // Right curly bracket { "", "rcub", "\\\\\\}" }, // Right curly bracket { "138", "", "\\{\\\\v\\{S\\}\\}" }, // Line tabulation set // {"141", "", ""}, // Reverse line feed { "145", "", "`" }, // Apostrophe { "146", "", "'" }, // Apostrophe { "147", "", "``" }, // Quotation mark { "148", "", "''" }, // Quotation mark { "150", "", "--" }, // En dash { "154", "", "\\{\\\\v\\{s\\}\\}" }, // Single character introducer { "260", "Aogon", "\\{\\\\k\\{A\\}\\}" }, // capital A with ogonek { "261", "aogon", "\\{\\\\k\\{a\\}\\}" }, // small a with ogonek { "262", "Cacute", "\\{\\\\'\\{C\\}\\}" }, // capital C with acute { "263", "cacute", "\\{\\\\'\\{c\\}\\}" }, // small C with acute { "264", "Ccirc", "\\{\\\\\\^\\{C\\}\\}" }, // capital C with circumflex { "265", "ccirc", "\\{\\\\\\^\\{c\\}\\}" }, // small C with circumflex { "266", "Cdot", "\\{\\\\\\.\\{C\\}\\}" }, // capital C with dot above { "267", "cdot", "\\{\\\\\\.\\{c\\}\\}" }, // small C with dot above { "268", "Ccaron", "\\{\\\\v\\{C\\}\\}" }, // capital C with caron { "269", "ccaron", "\\{\\\\v\\{c\\}\\}" }, // small C with caron { "272", "Dstrok", "\\{\\\\DJ\\}" }, // capital D with stroke { "273", "dstrok", "\\{\\\\dj\\}" }, // small d with stroke { "280", "Eogon", "\\{\\\\k\\{E\\}\\}" }, // capital E with ogonek { "281", "eogon", "\\{\\\\k\\{e\\}\\}" }, // small e with ogonek { "298", "Imacr", "\\{\\\\=\\{I\\}\\}" }, // capital I with macron { "299", "imacr", "\\{\\\\=\\{\\\\i\\}\\}" }, // small i with macron { "302", "Iogon", "\\{\\\\k\\{I\\}\\}" }, // capital I with ogonek { "303", "iogon", "\\{\\\\k\\{i\\}\\}" }, // small i with ogonek { "304", "Idot", "\\{\\\\.\\{I\\}\\}" }, // capital I with dot above { "305", "inodot", "\\{\\\\i\\}" }, // Small i without the dot { "", "imath", "\\{\\\\i\\}" }, // Small i without the dot { "306", "", "\\{\\\\IJ\\}" }, // Dutch di-graph IJ { "307", "", "\\{\\\\ij\\}" }, // Dutch di-graph ij { "312", "", "\\{\\\\textkra\\}" }, // Letter kra { "321", "Lstrok", "\\{\\\\L\\}" }, // upper case L with stroke { "322", "lstrok", "\\{\\\\l\\}" }, // lower case l with stroke { "330", "", "\\{\\\\NG\\}" }, // upper case letter Eng { "331", "", "\\{\\\\ng\\}" }, // lower case letter Eng { "338", "", "\\{\\\\OE\\}" }, // OE-ligature { "339", "", "\\{\\\\oe\\}" }, // oe-ligature { "348", "Scirc", "\\{\\\\\\^\\{S\\}\\}" }, // upper case S with circumflex { "349", "scirc", "\\{\\\\\\^\\{s\\}\\}" }, // lower case s with circumflex { "370", "Uogon", "\\{\\\\k\\{U\\}\\}" }, // capital U with ogonek { "371", "uogon", "\\{\\\\k\\{u\\}\\}" }, // small u with ogonek { "381", "Zcaron", "\\{\\\\v\\{Z\\}\\}" }, // capital Z with caron { "382", "zcaron", "\\{\\\\v\\{z\\}\\}" }, // small z with caron { "405", "", "\\{\\\\hv\\}" }, // small letter Hv { "416", "", "\\{\\\\OHORN\\}" }, // capital O with horn { "417", "", "\\{\\\\ohorn\\}" }, // small o with horn { "431", "", "\\{\\\\UHORN\\}" }, // capital U with horn { "432", "", "\\{\\\\uhorn\\}" }, // small u with horn { "490", "Oogon", "\\{\\\\k\\{O\\}\\}" }, // capital letter O with ogonek { "491", "oogon", "\\{\\\\k\\{o\\}\\}" }, // small letter o with ogonek { "492", "", "\\{\\\\k\\{\\\\=\\{O\\}\\}\\}" }, // capital letter O with ogonek and macron { "493", "", "\\{\\\\k\\{\\\\=\\{o\\}\\}\\}" }, // small letter o with ogonek and macron { "536", "", "\\{\\\\cb\\{S\\}\\}" }, // capital letter S with comma below, require combelow { "537", "", "\\{\\\\cb\\{s\\}\\}" }, // small letter S with comma below, require combelow { "538", "", "\\{\\\\cb\\{T\\}\\}" }, // capital letter T with comma below, require combelow { "539", "", "\\{\\\\cb\\{t\\}\\}" }, // small letter T with comma below, require combelow { "727", "caron", "\\{\\\\v\\{\\}\\}" }, // Caron { "", "Hacek", "\\{\\\\v\\{\\}\\}" }, // Caron { "728", "breve", "\\{\\\\u\\{\\}\\}" }, // Breve { "", "Breve", "\\{\\\\u\\{\\}\\}" }, // Breve { "729", "dot", "\\{\\\\\\.\\{\\}\\}" }, // Dot above { "730", "ring", "\\{\\\\r\\{\\}\\}" }, // Ring above { "731", "ogon", "\\{\\\\k\\{\\}\\}" }, // Ogonek { "733", "dblac", "\\{\\\\H\\{\\}\\}" }, // Double acute { "949", "epsi", "\\$\\\\epsilon\\$" }, // Epsilon - double check { "1013", "epsiv", "\\$\\\\varepsilonup\\$" }, // lunate epsilon, requires txfonts { "1055", "", "\\{\\\\cyrchar\\\\CYRP\\}" }, // Cyrillic capital Pe { "1082", "", "\\{\\\\cyrchar\\\\cyrk\\}" }, // Cyrillic small Ka // {"2013", "", ""}, // NKO letter FA -- Maybe en dash = 0x2013? // {"2014", "", ""}, // NKO letter FA -- Maybe em dash = 0x2014? { "8192", "", "\\\\hspace\\{0.5em\\}" }, // en quad { "8193", "", "\\\\hspace\\{1em\\}" }, // em quad { "8196", "", "\\\\hspace\\{0.333em\\}" }, // Three-Per-Em Space { "8197", "", "\\\\hspace\\{0.25em\\}" }, // Four-Per-Em Space { "8198", "", "\\\\hspace\\{0.167em\\}" }, // Six-Per-Em Space { "8208", "hyphen", "-" }, // Hyphen { "8229", "nldr", "\\.\\." }, // Double dots - en leader { "8241", "", "\\{\\\\textpertenthousand\\}" }, // per ten thousands sign { "8244", "", "\\{\\\\prime\\\\prime\\\\prime\\}" }, // triple prime { "8251", "", "\\{\\\\textreferencemark\\}" }, { "8253", "", "\\{\\\\textinterrobang\\}" }, { "8450", "complexes", "\\$\\\\mathbb\\{C\\}\\$" }, // double struck capital C -- requires e.g. amsfonts { "8451", "", "\\$\\\\deg\\$\\{C\\}" }, // Degree Celsius { "8459", "Hscr", "\\$\\\\mathcal\\{H\\}\\$" }, // script capital H -- possibly use \mathscr { "8460", "Hfr", "\\$\\\\mathbb\\{H\\}\\$" }, // black letter capital H -- requires e.g. amsfonts { "8466", "Lscr", "\\$\\\\mathcal\\{L\\}\\$" }, // script capital L -- possibly use \mathscr { "8467", "ell", "\\{\\\\ell\\}" }, // script small l { "8469", "naturals", "\\$\\\\mathbb\\{N\\}\\$" }, // double struck capital N -- requires e.g. amsfonts { "8474", "Qopf", "\\$\\\\mathbb\\{Q\\}\\$" }, // double struck capital Q -- requires e.g. amsfonts { "8477", "reals", "\\$\\\\mathbb\\{R\\}\\$" }, // double struck capital R -- requires e.g. amsfonts { "8486", "", "\\$\\{\\\\Omega\\}\\$" }, // Omega { "8491", "angst", "\\{\\\\AA\\}" }, // Angstrom { "8496", "Escr", "\\$\\\\mathcal\\{E\\}\\$" }, // script capital E { "8531", "frac13", "\\$\\\\sfrac\\{1\\}\\{3\\}\\$" }, // Vulgar fraction one third { "8532", "frac23", "\\$\\\\sfrac\\{2\\}\\{3\\}\\$" }, // Vulgar fraction two thirds { "8533", "frac15", "\\$\\\\sfrac\\{1\\}\\{5\\}\\$" }, // Vulgar fraction one fifth { "8534", "frac25", "\\$\\\\sfrac\\{2\\}\\{5\\}\\$" }, // Vulgar fraction two fifths { "8535", "frac35", "\\$\\\\sfrac\\{3\\}\\{5\\}\\$" }, // Vulgar fraction three fifths { "8536", "frac45", "\\$\\\\sfrac\\{4\\}\\{5\\}\\$" }, // Vulgar fraction four fifths { "8537", "frac16", "\\$\\\\sfrac\\{1\\}\\{6\\}\\$" }, // Vulgar fraction one sixth { "8538", "frac56", "\\$\\\\sfrac\\{5\\}\\{6\\}\\$" }, // Vulgar fraction five sixths { "8539", "frac18", "\\$\\\\sfrac\\{1\\}\\{8\\}\\$" }, // Vulgar fraction one eighth { "8540", "frac38", "\\$\\\\sfrac\\{3\\}\\{8\\}\\$" }, // Vulgar fraction three eighths { "8541", "frac58", "\\$\\\\sfrac\\{5\\}\\{8\\}\\$" }, // Vulgar fraction five eighths { "8542", "frac78", "\\$\\\\sfrac\\{7\\}\\{8\\}\\$" }, // Vulgar fraction seven eighths { "8710", "", "\\$\\\\triangle\\$" }, // Increment - could use a more appropriate symbol { "8714", "", "\\$\\\\in\\$" }, // Small element in { "8723", "mp", "\\$\\\\mp\\$" }, // Minus-plus { "8729", "bullet", "\\$\\\\bullet\\$" }, // Bullet operator { "8758", "ratio", ":" }, // Colon/ratio { "8771", "sime", "\\$\\\\simeq\\$" }, // almost equal to = asymptotic to, { "8776", "ap", "\\$\\\\approx\\$" }, // almost equal to = asymptotic to, { "8810", "ll", "\\$\\\\ll\\$" }, // Much less than { "", "Lt", "\\$\\\\ll\\$" }, // Much less than { "8811", "gg", "\\$\\\\gg\\$" }, // Much greater than { "", "Gt", "\\$\\\\gg\\$" }, // Much greater than { "8818", "lsim", "\\$\\\\lesssim\\$" }, // Less than or equivalent to { "8819", "gsim", "\\$\\\\gtrsim\\$" }, // Greater than or equivalent to { "8862", "boxplus", "\\$\\\\boxplus\\$" }, // Boxed plus -- requires amssymb { "8863", "boxminus", "\\$\\\\boxminus\\$" }, // Boxed minus -- requires amssymb { "8864", "boxtimes", "\\$\\\\boxtimes\\$" }, // Boxed times -- requires amssymb { "8882", "vltri", "\\$\\\\triangleleft\\$" }, // Left triangle { "8883", "vrtri", "\\$\\\\triangleright\\$" }, // Right triangle { "8896", "xwedge", "\\$\\\\bigwedge\\$" }, // Big wedge { "8897", "xvee", "\\$\\\\bigvee\\$" }, // Big vee { "8942", "vdots", "\\$\\\\vdots\\$" }, // vertical ellipsis U+22EE { "8943", "cdots", "\\$\\\\cdots\\$" }, // midline horizontal ellipsis U+22EF /*{"8944", "", "\\$\\\\ddots\\$"}, // up right diagonal ellipsis U+22F0 */ { "8945", "ddots", "\\$\\\\ddots\\$" }, // down right diagonal ellipsis U+22F1 { "9426", "circledc", "\\{\\\\copyright\\}" }, // circled small letter C { "9633", "square", "\\$\\\\square\\$" }, // White square { "9651", "xutri", "\\$\\\\bigtriangleup\\$" }, // White up-pointing big triangle { "9653", "utri", "\\$\\\\triangle\\$" }, // White up-pointing small triangle -- \vartriangle probably // better but requires amssymb { "10877", "les", "\\$\\\\leqslant\\$" }, // Less than slanted equal -- requires amssymb { "10878", "ges", "\\$\\\\geqslant\\$" }, // Less than slanted equal -- requires amssymb { "119978", "Oscr", "\\$\\\\mathcal\\{O\\}\\$" }, // script capital O -- possibly use \mathscr { "119984", "Uscr", "\\$\\\\mathcal\\{U\\}\\$" } // script capital U -- possibly use \mathscr }; // List of combining accents private final String[][] accentList = new String[][] { { "768", "`" }, // Grave { "769", "'" }, // Acute { "770", "\\^" }, // Circumflex { "771", "~" }, // Tilde { "772", "=" }, // Macron { "773", "=" }, // Overline - not completely correct { "774", "u" }, // Breve { "775", "\\." }, // Dot above { "776", "\"" }, // Diaeresis { "777", "h" }, // Hook above { "778", "r" }, // Ring { "779", "H" }, // Double acute { "780", "v" }, // Caron { "781", "\\|" }, // Vertical line above { "782", "U" }, // Double vertical line above { "783", "G" }, // Double grave { "784", "textdotbreve" }, // Candrabindu { "785", "t" }, // Inverted breve // {"786", ""}, // Turned comma above // {"787", ""}, // Comma above { "788", "textrevcommaabove" }, // Reversed comma above { "789", "textcommaabover" }, // Comma above right { "790", "textsubgrave" }, // Grave accent below -requires tipa { "791", "textsubacute" }, // Acute accent below - requires tipa { "792", "textadvancing" }, // Left tack below - requires tipa { "793", "textretracting" }, // Right tack below - requires tipa { "794", "textlangleabove" }, // Left angle above { "795", "textrighthorn" }, // Horn { "796", "textsublhalfring" }, // Left half ring below - requires tipa { "797", "textraising" }, // Up tack below - requires tipa { "798", "textlowering" }, // Down tack below - requires tipa { "799", "textsubplus" }, // Plus sign below - requires tipa { "800", "textsubbar" }, // Minus sign below { "801", "textpalhookbelow" }, // Palatalized hook below { "802", "M" }, // Retroflex hook below - textrethookbelow? { "803", "d" }, // Dot below { "804", "textsubumlaut" }, // Diaeresis below - requires tipa { "805", "textsubring" }, // Ring below - requires tipa { "806", "cb" }, // Comma below - requires combelow { "807", "c" }, // Cedilla { "808", "k" }, // Ogonek { "809", "textsyllabic" }, // Vertical line below - requires tipa { "810", "textsubbridge" }, // Bridge below - requires tipa { "811", "textsubw" }, // Inverted double arch below - requires tipa { "812", "textsubwedge" }, // Caron below { "813", "textsubcircum" }, // Circumflex accent below - requires tipa { "814", "textsubbreve" }, // Breve below { "815", "textsubarch" }, // Inverted breve below - requires tipa { "816", "textsubtilde" }, // Tilde below - requires tipa { "817", "b" }, // Macron below - not completely correct { "818", "b" }, // Underline { "819", "subdoublebar" }, // Double low line -- requires extraipa { "820", "textsuperimposetilde" }, // Tilde overlay - requires tipa { "821", "B" }, // Short stroke overlay - textsstrokethru? { "822", "textlstrokethru" }, // Long stroke overlay { "823", "textsstrikethru" }, // Short solidus overlay { "824", "textlstrikethru" }, // Long solidus overlay { "825", "textsubrhalfring" }, // Right half ring below - requires tipa { "826", "textinvsubbridge" }, // inverted bridge below - requires tipa { "827", "textsubsquare" }, // Square below - requires tipa { "828", "textseagull" }, // Seagull below - requires tipa { "829", "textovercross" }, // X above - requires tipa // {"830", ""}, // Vertical tilde // {"831", ""}, // Double overline // {"832", ""}, // Grave tone mark // {"833", ""}, // Acute tone mark // {"834", ""}, // Greek perispomeni // {"835", ""}, // Greek koronis // {"836", ""}, // Greek dialytika tonos // {"837", ""}, // Greek ypogegrammeni { "838", "overbridge" }, // Bridge above - requires extraipa { "839", "subdoublebar" }, // Equals sign below - requires extraipa { "840", "subdoublevert" }, // Double vertical line below - requires extraipa { "841", "subcorner" }, // Left angle below - requires extraipa { "842", "crtilde" }, // Not tilde above - requires extraipa { "843", "dottedtilde" }, // Homothetic above - requires extraipa { "844", "doubletilde" }, // Almost equal to above - requires extraipa { "845", "spreadlips" }, // Left right arrow below - requires extraipa { "846", "whistle" }, // Upwards arrow below - requires extraipa { "861", "textdoublebreve" }, // Double breve { "862", "textdoublemacron" }, // Double macron { "863", "textdoublemacronbelow" }, // Double macron below { "864", "textdoubletilde" }, // Double tilde { "865", "texttoptiebar" }, // Double inverted breve { "866", "sliding" }, // Double rightwards arrow below - requires extraipa }; private final Map<String, String> escapedSymbols = new HashMap<>(); private final Map<Integer, String> escapedAccents = new HashMap<>(); private final Map<Integer, String> numSymbols = new HashMap<>(); private final Map<Character, String> unicodeSymbols = new HashMap<>(); public HTMLConverter() { super(); for (String[] aConversionList : conversionList) { if (!(aConversionList[2].isEmpty())) { if (!(aConversionList[1].isEmpty())) { escapedSymbols.put("&" + aConversionList[1] + ";", aConversionList[2]); } if (!(aConversionList[0].isEmpty())) { numSymbols.put(Integer.decode(aConversionList[0]), aConversionList[2]); if (Integer.decode(aConversionList[0]) > 128) { Character c = (char) Integer.decode(aConversionList[0]).intValue(); unicodeSymbols.put(c, aConversionList[2]); // System.err.println(Integer.decode(conversionList[i][0]).toString() + ": " + c.toString() + ": "+ conversionList[i][2]); } } } } for (String[] anAccentList : accentList) { escapedAccents.put(Integer.decode(anAccentList[0]), anAccentList[1]); } } public String formatUnicode(String text) { if (text == null) { return null; } Set<Character> chars = unicodeSymbols.keySet(); for (Character character : chars) { // System.err.println(new Integer((int) character).toString() + ": " + character.toString() + ": " + unicodeSymbols.get(character)); text = text.replaceAll(character.toString(), unicodeSymbols.get(character)); } Integer cp; for (int i = 0; i <= (text.length() - 1); i++) { cp = text.codePointAt(i); if (cp >= 129) { LOGGER.warn("Unicode character not converted: " + cp); } } return text; } @Override public String format(String text) { if (text == null) { return null; } StringBuffer sb = new StringBuffer(); // Deal with the form <sup>k</sup>and <sub>k</sub> // If the result is in text or equation form can be controlled // From the "Advanced settings" tab if (Globals.prefs.getBoolean(JabRefPreferences.USE_CONVERT_TO_EQUATION)) { text = text.replaceAll("<[ ]?sup>([^<]+)</sup>", "\\$\\^\\{$1\\}\\$"); text = text.replaceAll("<[ ]?sub>([^<]+)</sub>", "\\$_\\{$1\\}\\$"); } else { text = text.replaceAll("<[ ]?sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}"); text = text.replaceAll("<[ ]?sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}"); } // TODO: maybe rewrite this based on regular expressions instead // Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to // remove tags for its image alt-tag to equation converter for (int i = 0; i < text.length(); i++) { int c = text.charAt(i); if (c == '<') { i = readTag(text, i); } else { sb.append((char) c); } } text = sb.toString(); // Handle text based HTML entities Set<String> patterns = escapedSymbols.keySet(); for (String pattern : patterns) { text = text.replaceAll(pattern, escapedSymbols.get(pattern)); } // Handle numerical HTML entities Pattern escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);"); Matcher m = escapedPattern.matcher(text); while (m.find()) { // System.err.println("Found pattern: " + m.group(1)); // System.err.println("Found pattern: " + m.group(2)); int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3)); if (numSymbols.containsKey(num)) { text = text.replaceAll("&#" + m.group(1) + m.group(2) + m.group(3) + ";", numSymbols.get(num)); } } escapedPattern = Pattern.compile("(.)&#([x]*)([0]*)(\\p{XDigit}+);"); m = escapedPattern.matcher(text); while (m.find()) { // System.err.println("Found pattern: " + m.group(1)); // System.err.println("Found pattern: " + m.group(2)); int num = Integer.decode(m.group(2).replace("x", "#") + m.group(4)); if (escapedAccents.containsKey(num)) { if ("i".equals(m.group(1))) { text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{\\\\i\\}\\}"); } else if ("j".equals(m.group(1))) { text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{\\\\j\\}\\}"); } else { text = text.replaceAll(m.group(1) + "&#" + m.group(2) + m.group(3) + m.group(4) + ";", "\\{\\\\" + escapedAccents.get(num) + "\\{" + m.group(1) + "\\}\\}"); } } } escapedPattern = Pattern.compile("&#([x]*)([0]*)(\\p{XDigit}+);"); m = escapedPattern.matcher(text); while (m.find()) { // System.err.println("Found pattern: " + m.group(1)); // System.err.println("Found pattern: " + m.group(2)); int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3)); LOGGER.warn("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3) + " = " + Integer.toString(num)); } // Remove $$ in case of two adjacent conversions text = text.replace("$$", ""); // Find non-covered special characters with alphabetic codes escapedPattern = Pattern.compile("&(\\w+);"); m = escapedPattern.matcher(text); while (m.find()) { LOGGER.warn("HTML escaped char not converted: " + m.group(1)); } return text.trim(); } /*private final int MAX_TAG_LENGTH = 30;*/ /*private final int MAX_CHAR_LENGTH = 10; private int readHtmlChar(String text, StringBuffer sb, int position) { // Have just read the < character that starts the tag. int index = text.indexOf(';', position); if ((index > position) && (index-position < MAX_CHAR_LENGTH)) { //String code = text.substring(position, index); //System.out.println("Removed code: "+text.substring(position, index)); return index; // Just skip the tag. } else return position; // Don't do anything. }*/ private int readTag(String text, int position) { // Have just read the < character that starts the tag. int index = text.indexOf('>', position); if ((index > position) && ((index - position) < MAX_TAG_LENGTH)) { return index; // Just skip the tag. } else { return position; // Don't do anything. } } @Override public String getName() { return "HTMLConverter"; } }