List of usage examples for java.util Locale ROOT
Locale ROOT
To view the source code for java.util Locale ROOT, click Source Link.
From source file:mekhq.Utilities.java
/** * Run through the directory and call parser.parse(fis) for each XML file found. *///from w w w . java 2 s .c o m public static void parseXMLFiles(String dirName, FileParser parser, boolean recurse) { final String METHOD_NAME = "parseXMLFiles(String,FileParser,boolean)"; //$NON-NLS-1$ if (null == dirName || null == parser) { throw new NullPointerException(); } File dir = new File(dirName); if (dir.isDirectory()) { File[] files = dir.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { return name.toLowerCase(Locale.ROOT).endsWith(".xml"); //$NON-NLS-1$ } }); if (null != files && files.length > 0) { // Case-insensitive sorting. Yes, even on Windows. Deal with it. Arrays.sort(files, new Comparator<File>() { @Override public int compare(File f1, File f2) { return f1.getPath().compareTo(f2.getPath()); } }); // Try parsing and updating the main list, one by one for (File file : files) { if (file.isFile()) { try (FileInputStream fis = new FileInputStream(file)) { parser.parse(fis); } catch (Exception ex) { // Ignore this file then MekHQ.getLogger().log(Utilities.class, METHOD_NAME, LogLevel.ERROR, "Exception trying to parse " + file.getPath() + " - ignoring."); //$NON-NLS-1$ //$NON-NLS-2$ MekHQ.getLogger().log(Utilities.class, METHOD_NAME, ex); } } } } if (!recurse) { // We're done return; } // Get subdirectories too File[] dirs = dir.listFiles(); if (null != dirs && dirs.length > 0) { Arrays.sort(dirs, new Comparator<File>() { @Override public int compare(File f1, File f2) { return f1.getPath().compareTo(f2.getPath()); } }); for (File subDirectory : dirs) { if (subDirectory.isDirectory()) { parseXMLFiles(subDirectory.getPath(), parser, recurse); } } } } }
From source file:net.yacy.cora.document.id.MultiProtocolURL.java
public final String language() { String language = "en"; if (this.host == null) return language; final int pos = this.host.lastIndexOf('.'); String host_tld = this.host.substring(pos + 1).toLowerCase(Locale.ROOT); if (pos == 0) return language; int length = this.host.length() - pos - 1; switch (length) { case 2:/*from w w w . j a v a2 s.com*/ char firstletter = host_tld.charAt(0); switch (firstletter) {//speed-up case 'a': if (host_tld.equals("au")) {//Australia /91,000,000 language = "en";//australian english; eng; eng; ause } else if (host_tld.equals("at")) {//Austria /23,000,000 language = "de";//german; ger (deu); deu } else if (host_tld.equals("ar")) {//Argentina /10,700,000 language = "es";//spanish } else if (host_tld.equals("ae")) {//United Arab Emirates /3,310,000 language = "ar";//arabic } else if (host_tld.equals("am")) {//Armenia /2,080,000 language = "hy";//armenian; arm (hye); hye } else if (host_tld.equals("ac")) {//Ascension Island /2,060,000 language = "en";//english } else if (host_tld.equals("az")) {//Azerbaijan /1,340,000 language = "az";//azerbaijani; aze; aze (azj, azb) } else if (host_tld.equals("ag")) {//Antigua and Barbuda /1,310,000 language = "en";//english } else if (host_tld.equals("as")) {//American Samoa /1,220,000 language = "en";//english } else if (host_tld.equals("al")) {//Albania /389,000 language = "sq";//albanian; alb (sqi); sqi } else if (host_tld.equals("ad")) {//Andorra /321,000 language = "ca";//catalan; cat } else if (host_tld.equals("ao")) {//Angola /153,000 language = "pt";//portuguese } else if (host_tld.equals("ai")) {//Anguilla /149,000 language = "en";//english } else if (host_tld.equals("af")) {//Afghanistan /101,000 language = "ps";//pashto; pus } else if (host_tld.equals("an")) {//Netherlands Antilles /78,100 language = "nl";//dutch } else if (host_tld.equals("aq")) {//Antarctica /36,000 language = "en";//can be any } else if (host_tld.equals("aw")) {//Aruba /34,400 language = "nl";//dutch } else if 
(host_tld.equals("ax")) {//Aland Islands /28 language = "sv";//swedish } break; case 'b': if (host_tld.equals("br")) {//Brazil /25,800,000 language = "pt";//portuguese } else if (host_tld.equals("be")) {//Belgium /25,100,000 language = "nl";//dutch } else if (host_tld.equals("bg")) {//Bulgaria /3,480,000 language = "bg";//bulgarian; bul } else if (host_tld.equals("bz")) {//Belize /2,790,000 language = "en";//english } else if (host_tld.equals("ba")) {//Bosnia and Herzegovina /2,760,000 language = "sh";//serbo-croatian } else if (host_tld.equals("by")) {//Belarus /2,540,000 language = "be";//belarusian; bel } else if (host_tld.equals("bo")) {//Bolivia /1,590,000 language = "es";//spanish; spa //language = "qu";//quechua; que //language = "ay";//aymara; aym (ayr) //und viele andere (indian) } else if (host_tld.equals("bd")) {//Bangladesh /342,000 language = "bn";//bengali; ben } else if (host_tld.equals("bw")) {//Botswana /244,000 //language = "en";//english language = "tn";//tswana; tsn } else if (host_tld.equals("bh")) {//Bahrain /241,000 language = "ar";//arabic } else if (host_tld.equals("bf")) {//Burkina Faso /239,000 language = "fr";//french } else if (host_tld.equals("bm")) {//Bermuda /238,000 language = "en";//english } else if (host_tld.equals("bn")) {//Brunei Darussalam /157,000 language = "ms";//malay; msa/mhp } else if (host_tld.equals("bb")) {//Barbados /131,000 language = "en";//english } else if (host_tld.equals("bt")) {//Bhutan /123,000 language = "dz";//dzongkha; dzo } else if (host_tld.equals("bi")) {//Burundi /60,600 language = "rn";//kirundi; run } else if (host_tld.equals("bs")) {//Bahamas /37,700 language = "en";//english } else if (host_tld.equals("bj")) {//Benin /36,200 language = "fr";//french; fra (fre); fra } else if (host_tld.equals("bv")) {//Bouvet Island /55 language = "no";//norwegian; nor (nob/nno) } break; case 'c': if (host_tld.equals("ca")) {//Canada /165,000,000 language = "en";//english //language = "fr";//french } else if 
(host_tld.equals("ch")) {//Switzerland /62,100,000 language = "de";//german; gsw } else if (host_tld.equals("cn")) {//People's Republic of China /26,700,000 language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("cz")) {//Czech Republic /18,800,000 language = "cs";//czech; cze (ces); ces } else if (host_tld.equals("cl")) {//Chile /18,500,000 language = "es";//spanish; spa } else if (host_tld.equals("co")) {//Colombia /4,270,000 language = "es";//spanish; spa } else if (host_tld.equals("cc")) {//Cocos (Keeling) Islands /4,050,000 language = "en";//english } else if (host_tld.equals("cr")) {//Costa Rica /2,060,000 language = "es";//spanish; spa } else if (host_tld.equals("cy")) {//Cyprus /2,500,000 language = "el";//greek; gre (ell); ell } else if (host_tld.equals("cu")) {//Cuba /2,040,000 language = "es";//spanish; spa } else if (host_tld.equals("cx")) {//Christmas Island /1,830,000 language = "en";//english } else if (host_tld.equals("cd")) {//Democratic Republic of the Congo /475,000 language = "fr";//french } else if (host_tld.equals("cg")) {//Republic of the Congo /193,000 language = "fr";//french } else if (host_tld.equals("cm")) {//Cameroon /119,000 //language = "fr";//french language = "en";//english } else if (host_tld.equals("ci")) {//Cote d'Ivoire /95,200 language = "fr";//french } else if (host_tld.equals("cv")) {//Cape Verde /81,900 language = "pt";//portuguese; por } else if (host_tld.equals("ck")) {//Cook Islands /43,300 language = "en";//english //language = "";//cook islands maori; rar (pnh, rkh) } else if (host_tld.equals("cf")) {//Central African Republic /703 language = "sg";//sango; sag; 92% could speak //language = "fr";//french; fra (fre); fra; 22,5% could speak, but maybe inet users prefer this } break; case 'd': if (host_tld.equals("dk")) {//Denmark /19,700,000 language = "da";//danish; dan } else if (host_tld.equals("do")) {//Dominican Republic /1,510,000 language = "es";//spanish; spa } 
else if (host_tld.equals("dz")) {//Algeria /326,000 language = "ar";//arabic; ara; arq } else if (host_tld.equals("dj")) {//Djibouti /150,000 language = "ar";//arabic; ara; 94% are muslims, so arabic is primary //language = "fr";//french; fre (fra); fra } else if (host_tld.equals("dm")) {//Dominica /30,100 language = "en";//english } break; case 'e': if (host_tld.equals("ee")) {//Estonia /6,790,000 language = "et";//estonian; est; est (ekk) } else if (host_tld.equals("eg")) {//Egypt /2,990,000 language = "ar";//modern standard arabic; ara; arb //language = "ar";//egyptian arabic; ara; arz } else if (host_tld.equals("ec")) {//Ecuador /2,580,000 language = "es";//spanish; spa } else if (host_tld.equals("et")) {//Ethiopia /142,000 language = "am";//amharic; amh } else if (host_tld.equals("eu")) {//European Union /45,100 language = "en";//english (what can be else) } else if (host_tld.equals("er")) {//Eritrea /15,800 language = "ti";//tigrinya; tir } break; case 'f': if (host_tld.equals("fr")) {//France /96,700,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("fi")) {//Finland /28,100,000 language = "fi";//finnish; fin (92%) } else if (host_tld.equals("fm")) {//Federated States of Micronesia /4,580,000 language = "en";//english //all native at regional level } else if (host_tld.equals("fo")) {//Faroe Islands /623,000 language = "fo";//faroese; fao } else if (host_tld.equals("fj")) {//Fiji /466,000 language = "fj";//fijian; fij //also english, fiji hindi etc } else if (host_tld.equals("fk")) {//Falkland Islands /10,500 language = "en";//english } break; case 'g': if (host_tld.equals("gr")) {//Greece /13,500,000 language = "el";//greek; gre (ell); ell } else if (host_tld.equals("ge")) {//Georgia /2,480,000 language = "ka";//georgian; geo (kat); kat } else if (host_tld.equals("gt")) {//Guatemala /904,000 language = "es";//spanish; spa } else if (host_tld.equals("gs")) {//South Georgia and the South Sandwich Islands /772,000 language = "en";//english 
} else if (host_tld.equals("gl")) {//Greenland /526,000 language = "kl";//greenlandic; kal } else if (host_tld.equals("gg")) {//Guernsey /322,000 language = "en";//english } else if (host_tld.equals("gi")) {//Gibraltar /193,000 language = "en";//english } else if (host_tld.equals("gh")) {//Ghana /107,000 language = "en";//english } else if (host_tld.equals("gy")) {//Guyana /68,700 language = "en";//english } else if (host_tld.equals("gm")) {//Gambia /59,300 language = "en";//english } else if (host_tld.equals("gn")) {//Guinea /18,700 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("ga")) {//Gabon /17,900 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("gd")) {//Grenada /13,600 language = "en";//english } else if (host_tld.equals("gu")) {//Guam /12,800 //language = "ch";//chamorro; cha (looks like young generation don't want to use) language = "en";//english } else if (host_tld.equals("gq")) {//Equatorial Guinea /1,450 language = "es";//spanish; spa } else if (host_tld.equals("gp")) {//Guadeloupe /980 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("gf")) {//French Guiana /926 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("gb")) {//United Kingdom of Great Britain and Northern Ireland (currently->uk) /186 language = "en";//english } else if (host_tld.equals("gw")) {//Guinea-Bissau /26 language = "pt";//portuguese; por } break; case 'h': if (host_tld.equals("hu")) {//Hungary /18,500,000 language = "hu";//hungarian; hun } else if (host_tld.equals("hk")) {//Hong Kong /9,510,000 language = "zh";//chinese; chi (zho, cmn) //also english } else if (host_tld.equals("hr")) {//Croatia /6,080,000 language = "hr";//croatian; hrv } else if (host_tld.equals("hn")) {//Honduras /628,000 language = "es";//spanish; spa } else if (host_tld.equals("hm")) {//Heard and McDonald Islands /194,000 language = "en";//english } else if (host_tld.equals("ht")) {//Haiti /17,700 language = "fr";//french; fre (fra); 
fra //language = "ht";//haitian creole; hat } break; case 'i': if (host_tld.equals("it")) {//Italy /55,200,000 language = "it";//italian; ita } else if (host_tld.equals("il")) {//Israel /17,800,000 language = "he";//hebrew; heb } else if (host_tld.equals("ie")) {//Republic of Ireland + Northern Ireland /17,000,000 language = "ga";//irish; gle //language = "en";//english } else if (host_tld.equals("in")) {//India /9,330,000 language = "hi";//hindi; hin } else if (language.equals("is")) {//Iceland /5,310,000 language = "is";//icelandic; ice (isl); isl } else if (host_tld.equals("ir")) {//Islamic Republic of Iran /2,940,000 language = "fa";//persian; per (fas); pes } else if (host_tld.equals("im")) {//Isle of Man /276,000 language = "en";//english //language = "gv";//manx; glv (was dead, currently only slogans etc basically) } else if (host_tld.equals("io")) {//British Indian Ocean Territory /108,000 language = "en";//english } else if (host_tld.equals("iq")) {//Iraq /133 language = "ar";//arabic; ara; acm //language = "ku";//kurdish; kur } break; case 'j': if (host_tld.equals("jp")) {//Japan /139,000,000 language = "ja";//japanese; jpn } else if (host_tld.equals("jo")) {//Jordan /601,000 language = "ar";//jordanian arabic; ara; ajp //language = "en";//english (businness) } else if (host_tld.equals("jm")) {//Jamaica /290,000 language = "en";//english } else if (host_tld.equals("je")) {//Jersey /202,000 language = "en";//english } break; case 'k': if (host_tld.equals("kr")) {//Republic of Korea /13,700,000 language = "ko";//korean; kor } else if (host_tld.equals("kz")) {//Kazakhstan /2,680,000 language = "kk";//kazakh; kaz //language = "ru";//russian; rus (de-facto is widely used than native language) } else if (host_tld.equals("kg")) {//Kyrgyzstan /1,440,000 language = "ky";//kyrgyz; kir //language = "ru";//russian; rus (perhaps this one here is widely used) } else if (host_tld.equals("ki")) {//Kiribati /427,000 //language = "";//kiribati; gil (this one must be used, 
but don't have ISO 639-1) (!) language = "en";//english //here also can be other languages: .de.ki = deutsch } else if (host_tld.equals("kw")) {//Kuwait /356,000 language = "ar";//arabic; ara } else if (host_tld.equals("ke")) {//Kenya /301,000 language = "sw";//swahili; swa; swh //language = "en";//english } else if (host_tld.equals("kh")) {//Cambodia /262,000 language = "km";//khmer; khm } else if (host_tld.equals("ky")) {//Cayman Islands /172,000 language = "en";//english } else if (host_tld.equals("kn")) {//Saint Kitts and Nevis /9,830 language = "en";//english } else if (host_tld.equals("km")) {//Comoros /533 //Comorian dialects ISO 639-3: zdj, wni, swb, wlc - must be used here language = "ar";//arabic; ara //language = "fr";//french; fre (fra); fra } else if (host_tld.equals("kp")) {//Democratic People's Republic of Korea /122 language = "ko";//korean; kor } break; case 'l': if (host_tld.equals("lv")) {//Latvia /6,970,000 language = "lv";//latvian; lav; lvs } else if (host_tld.equals("lt")) {//Lithuania /6,040,000 language = "lt";//lithuanian; lit } else if (host_tld.equals("lu")) {//Luxembourg /4,940,000 language = "lb";//luxembourgish; ltz (West Central German language familie; official 1984) //wide spoken, but not business or media //language = "fr";//french; fre (fra); fra (business) //language = "de";//german; ger (deu); ltz (media) } else if (host_tld.equals("li")) {//Liechtenstein /3,990,000 language = "de";//german; ger (deu); deu } else if (host_tld.equals("lb")) {//Lebanon /1,890,000 language = "ar";//arabic; ara } else if (host_tld.equals("lk")) {//Sri Lanka /1,770,000 language = "si";//sinhala; sin //language = "ta";//tamil; tam } else if (host_tld.equals("la")) {//Laos (Lao Peoples Democratic Republic) /932,000 language = "lo";//lao; lao } else if (host_tld.equals("ly")) {//Libya /388,000 language = "ar";//libyan arabic; ara; ayl } else if (host_tld.equals("lc")) {//Saint Lucia /86,400 language = "en";//english //language = "";//french creole; acf 
(ISO 639-3) //ISO 639-1 is missed + not official, but this is 95% speaking language - must be first (!) } else if (host_tld.equals("ls")) {//Lesotho /81,900 language = "st";//sotho; sot (97%) //language = "en";//english } else if (host_tld.equals("lr")) {//Liberia /588 language = "en";//english } break; case 'm': if (host_tld.equals("mx")) {//Mexico /13,700,000 language = "es";//spanish; spa } else if (host_tld.equals("my")) {//Malaysia /4,610,000 language = "en";//english (business) //language = "";//malaysian; zsm, zlm (maybe must be used here, but no ISO 639-1,2) } else if (host_tld.equals("md")) {//Moldova /3,230,000 language = "ro";//romanian; rum (ron); ron } else if (host_tld.equals("ma")) {//Morocco /3,030,000 language = "ar";//moroccan arabic; ara; ary //language = "fr";//french; fre (fra); fra //language = "";//amazigh (berber); ber; tzm (no ISO 639-1 code) } else if (host_tld.equals("mk")) {//Republic of Macedonia /2,980,000 language = "mk";//macedonian; mac (mkd); mkd } else if (host_tld.equals("ms")) {//Montserrat /2,160,000 language = "en";//english } else if (host_tld.equals("mt")) {//Malta /1,650,000 language = "mt";//maltese; mlt //100% speak Maltese, 88% English, 66% Italian //(but about 75-80% of sites have default english, support of maltese have ~50% of sites) } else if (host_tld.equals("mo")) {//Macau /1,310,000 language = "zh";//chinese; chi (zho); yue (cantonese) } else if (host_tld.equals("mn")) {//Mongolia /1,160,000 language = "mn";//Mongolian; mon; mon: khk } else if (host_tld.equals("mp")) {//Northern Mariana Islands /861,000 language = "en";//english //language = "ch";//chamorro; cha //language = "";//carolinian; ISO 639-3: cal (no ISO 639-1) } else if (host_tld.equals("mu")) {//Mauritius /651,000 language = "fr";//french; fre (fra); fra, mfe (predominant on media) //language = "en";//english (goverment) } else if (host_tld.equals("mm")) {//Myanmar /367,000 language = "my";//burmese; bur (mya); mya } else if (host_tld.equals("mc")) 
{//Monaco /307,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("me")) {//Montenegro /? language = "sh";//montenegrin (~serbo-croatian, near serbian); scr, scc; hbs (macrolanguage): srp (serbian) } else if (host_tld.equals("mz")) {//Mozambique /288,000 language = "pt";//portuguese; por //language = "";//makhuwa; vmw (ISO 639-3) } else if (host_tld.equals("mg")) {//Madagascar /255,000 language = "mg";//malagasy; mlg (mlg); mlg (macrolanguage): plt //language = "fr";//french; fre (fra); fra //malagasy is native language, but elite want to french } else if (host_tld.equals("mr")) {//Mauritania /210,000 language = "ar";//arabic; ara; mey //language = "fr";//french; fre (fra); fra } else if (host_tld.equals("mv")) {//Maldives /125,000 language = "dv";//dhivehi; div //English is used widely in commerce and increasingly in government schools. } else if (host_tld.equals("mw")) {//Malawi /87,000 //language = "ny";//chewa; nya language = "en";//english (founded sites in english only, include goverment) } else if (host_tld.equals("ml")) {//Mali /73,500 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("mq")) {//Martinique /19,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("mh")) {//Marshall Islands /53 language = "mh";//marshallese; mah //language = "en";//english } break; case 'n': if (host_tld.equals("no")) {//Norway /32,300,000 language = "no";//norwegian; nor (nob/nno) } else if (host_tld.equals("nz")) {//New Zealand /18,500,000 language = "en";//english //language = "mi";//maori; mao (mri); mri (4.2%) } else if (host_tld.equals("nu")) {//Niue /5,100,000 language = "en";//english //language = "";//niuean; niu (no ISO 639-1) (97.4% of native, but most are bilingual in English) } else if (host_tld.equals("ni")) {//Nicaragua /4,240,000 language = "es";//spanish; spa } else if (host_tld.equals("np")) {//Nepal /1,910,000 language = "ne";//nepali; nep } if (host_tld.equals("na")) {//Namibia /1,650,000 language = 
"af";//afrikaans; afr //language = "de";//German; ger (deu); deu //language = "ng";//ndonga (ovambo); kua (ndo); ndo //language = "en";//english //Official is English. //Northern majority of Namibians speak Oshiwambo as first language, //whereas the most widely understood and spoken Afrikaans. //Younger generation most widely understood English and Afrikaans. //Afrikaans is spoken by 60% of the WHITE community, German is spoken by 32%, //English is spoken by 7% and Portuguese by 1%. } else if (host_tld.equals("nr")) {//Nauru /466,000 //language = "na";//Nauruan; nau (50% - 66% at home) language = "en";//english (goverment + business, also .co.nr is free so here can be any) } else if (host_tld.equals("nc")) {//New Caledonia /265,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("ne")) {//Niger /151,000 language = "fr";//french; fre (fra); fra (official and elite) //language = "ha";//hausa; hau (50%) } else if (host_tld.equals("ng")) {//Nigeria /101,000 language = "en";//english } else if (host_tld.equals("nf")) {//Norfolk Island /54,900 language = "en";//english } break; case 'o': if (host_tld.equals("om")) {//Oman /204,000 language = "ar";//omani arabic; ara; acx //language = "en";//english (education and science is ar/en, but people speak mostly arabic) } break; case 'p': if (host_tld.equals("pl")) {//Poland /20,100,000 language = "pl";//polish; pol } else if (host_tld.equals("pt")) {//Portugal /9,100,000 language = "pt";//portuguese; por } else if (host_tld.equals("ph")) {//Philippines /4,080,000 language = "tl";//filipino; fil //language = "en";//english } else if (host_tld.equals("pk")) {//Pakistan /3,180,000 language = "ur";//urdu; urd (lingua franca and national language) //language = "en";//english (official language and used in business, government, and legal contracts) //language = "";//pakistani english;6:pake //(sase: South-Asian-English, engs: English Spoken) //language = "pa";//punjabi; pan //language = "ps";//pashto; pus; pst, 
pbt //language = "sd";//sindhi; snd //also Saraiki skr (no 1,2) and Balochi bal; bal (bgp, bgn, bcc) (no 1) } else if (host_tld.equals("pw")) {//Palau /3,010,000 language = "en";//english //language = "";//palauan; pau (no ISO 639-1) //language = "tl";//tagalog; tgl //language = "ja";//japanese; jpn } else if (host_tld.equals("pe")) {//Peru /2,740,000 language = "es";//spanish; spa (83.9%) //language = "qu";//quechua; que (13.2%) } else if (host_tld.equals("pr")) {//Puerto Rico /1,920,000 language = "es";//spanish; spa } else if (host_tld.equals("pa")) {//Panama /1,040,000 language = "es";//spanish; spa } else if (host_tld.equals("py")) {//Paraguay /962,000 language = "gn";//guarani; grn; gug (90%) //language = "es";//spanish; spa (87%) } else if (host_tld.equals("ps")) {//Palestinian territories /559,000 language = "ar";//palestinian arabic; ara; ajp } else if (host_tld.equals("pf")) {//French Polynesia /240,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("pg")) {//Papua New Guinea /211,000 language = "en";//english (also pidgin Tok Pisin) //language = "ho";//hiri motu; hmo } else if (host_tld.equals("pn")) {//Pitcairn Islands /80,900 language = "en";//english/pitkern (english creole); pih (ISO 639-3) //language = "en";//english (second language in schools) } else if (host_tld.equals("pm")) {//Saint-Pierre and Miquelon /184 language = "fr";//french; fre (fra); fra } break; case 'q': if (host_tld.equals("qa")) {//Qatar /259,000 language = "ar";//gulf arabic; ara; afb } break; case 'r': if (host_tld.equals("ru")) {//Russia /67,900,000 language = "ru";//russian; rus } else if (host_tld.equals("ro")) {//Romania /7,990,000 language = "ro";//daco-romanian; rum (ron); ron } else if (host_tld.equals("rs")) {//Serbia /? 
language = "sr";//serbian; srp } else if (host_tld.equals("re")) {//Reunion /146,000 language = "fr";//french; fre (fra); fra, rcf (Reunion Creole) } else if (host_tld.equals("rw")) {//Rwanda /131,000 language = "rw";//kinyarwanda; kin //language = "en";//english //language = "fr";//french; fre (fra); fra //language = "sw";//swahili; swa } break; case 's': if (host_tld.equals("se")) {//Sweden /39,000,000 language = "sv";//swedish; swe } else if (host_tld.equals("es")) {//Spain /31,000,000 language = "es";//spanish; spa } else if (host_tld.equals("sg")) {//Singapore /8,770,000 language = "zh";//singaporean mandarin (chinese); chi (zho); cmn (49.9%) //language = "en";//english (business, government and medium of instruction in schools) (32.3%) //language = "ms";//malay; may (msa); msa, zsm ("national language") (12.2%) //language = "ta";//tamil; tam } else if (host_tld.equals("sk")) {//Slovakia /8,040,000 language = "sk";//slovak; slo (slk); slk } else if (host_tld.equals("si")) {//Slovenia /4,420,000 language = "sl";//slovene; slv } else if (host_tld.equals("su")) {//Soviet Union /3,530,000 language = "ru";//russian; rus } else if (host_tld.equals("sa")) {//Saudi Arabia /2,770,000 language = "ar";//gulf arabic; ara; afb } else if (host_tld.equals("st")) {//Sao Tome and Principe /2,490,000 language = "pt";//portuguese; por (95%) //language = "pt";//forro (creole); por; cri (85%) //language = "pt";//angolar (creole); cpp; aoa (3%) //language = "fr";//french; fre (fra); fra (Francophonie -> learns in schools) } else if (host_tld.equals("sv")) {//El Salvador /1,320,000 language = "es";//spanish; spa //language = "";//nahuatl; nah; nlv and others (no ISO 639-1) //language = "";//mayan; myn (no ISO 639-1,3) //language = "";//q'eqchi'; kek (no ISO 639-1,2) } else if (host_tld.equals("sc")) {//Seychelles /949,000 language = "en";//english //language = "fr";//french; fre (fra); fra //language = "fr";//seychellois creole; fre (fra); crs } else if (host_tld.equals("sh")) 
{//Saint Helena /547,000 language = "en";//english } else if (host_tld.equals("sn")) {//Senegal /503,000 language = "wo";//wolof; wol (80%) //language = "fr";//french; fre (fra); fra //(understood ~15%-20% of all males and ~1%-2% of all women, but official) } else if (host_tld.equals("sr")) {//Suriname /242,000 language = "nl";//dutch; dut (nld); nld (education, government, business and the media) //language = "en";//sranan (suriname creole); srn; srn //language = "bh";//bhojpuri (Surinamese Hindi is a dialect of Bhojpuri); bho //language = "jv";//javanese; jvn } else if (host_tld.equals("sm")) {//San Marino /225,000 language = "it";//italian; ita } else if (host_tld.equals("sy")) {//Syria /115,000 language = "ar";//syrian arabic; ara; apc, ajp //language = "ku";//kurmanji (kurdish); kur; kmr } else if (host_tld.equals("sz")) {//Swaziland /81,500 language = "ss";//swazi; ssw //language = "en";//english } else if (host_tld.equals("sl")) {//Sierra Leone /13,800 language = "en";//Sierra Leone Krio (english); eng; kri (97% spoken) //language = "en";//english (official) } else if (host_tld.equals("sb")) {//Solomon Islands /11,800 language = "en";//Pijin (Solomons Pidgin or Neo-Solomonic); cpe; pis //language = "en";//english (12%) } else if (host_tld.equals("sd")) {//Sudan /11,700 language = "ar";//sudanese arabic; ara; apd //language = "en";//english //english and arabic promoted by goverment (english for education and official) } else if (host_tld.equals("so")) {//Somalia /512 language = "so";//somali; som //language = "ar";//hadhrami arabic; ara; ayh //language = "en";//english //language = "it";//italian; ita //language = "sw";//bravanese (swahili); swa; swh } else if (host_tld.equals("ss")) {//South Sudan /? 
language = "en";//english //language = "ar";//juba arabic; ara; pga //language = "";//dinka; din (no ISO 639-1) //English and Juba Arabic are the official languages, although Dinka is the most widely spoken } break; case 't': if (host_tld.equals("tw")) {//Republic of China (Taiwan) /14,000,000 language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("tr")) {//Turkey /8,310,000 language = "tr";//turkish; tur } else if (host_tld.equals("tv")) {//Tuvalu /7,170,000 //used for TV, domain currently operated by dotTV, a VeriSign company //the Tuvalu government owns twenty percent of the company //language = "";//tuvaluan; tvl (no ISO 639-1) (close to Maori(mi), Tahitian(ty), Samoan(sm), Tongan(to)) language = "en";//english } else if (host_tld.equals("th")) {//Thailand /6,470,000 language = "th";//thai; tha } else if (host_tld.equals("tc")) {//Turks and Caicos Islands /2,610,000 //language = "en";//english language = "en";//turks and caicos islands creole; eng; tch } else if (host_tld.equals("to")) {//Tonga /2,490,000 //Often used unofficially for Torrent, Toronto, or Tokyo language = "to";//tongan; ton //language = "en";//english } else if (host_tld.equals("tk")) {//Tokelau /2,170,000 //Also used as a free domain service to the public (so maybe english here) language = "to";//tokelauan; tvl/ton; tkl (no ISO 639-1,2) //to - has marked similarities to the Niuafo'ou language of Tonga //tvl - Tokelauan is a Polynesian language closely related to Tuvaluan //language = "en";//english (main language is Tokelauan, but English is also spoken) } else if (host_tld.equals("tt")) {//Trinidad and Tobago /1,170,000 language = "en";//trinidadian english (official) //language = "en";//trinidadian creole; eng; trf (main spoken) //language = "en";//tobagonian creole; eng; tgh (main spoken) } else if (host_tld.equals("tn")) {//Tunisia /1,060,000 language = "ar";//tunisian arabic; ara; aeb } else if (host_tld.equals("tf")) {//French Southern 
and Antarctic Lands /777,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("tz")) {//Tanzania /405,000 language = "sw";//swahili; swa; swh //language = "en";//english (Higher courts, higher education) } else if (host_tld.equals("tj")) {//Tajikistan /153,000 language = "tg";//tajik; tgk //language = "ru";//russian; rus (wide in businness) } else if (host_tld.equals("tp")) {//East Timor /151,000 language = "pt";//portuguese; por //language = "en";//english } else if (host_tld.equals("tm")) {//Turkmenistan /136,000 language = "tk";//turkmen; tuk } else if (host_tld.equals("tg")) {//Togo /36,000 language = "fr";//french; fre (fra); fra } else if (host_tld.equals("tl")) {//East Timor (Timor-Leste) /18,100 //language = "";//tetum; tet (no ISO 639-1) language = "id";//indonesian; ind //language = "pt";//portuguese; por (5% literally, 25-50% listeners) //language = "en";//english } else if (host_tld.equals("td")) {//Chad /332 language = "ar";//chadian arabic; ara; shu //language = "ar";//arabic; ara //language = "fr";//french; fre (fra); fra } break; case 'u': if (host_tld.equals("uk")) {//United Kingdom of Great Britain and Northern Ireland /473,000,000 language = "en";//english } else if (host_tld.equals("us")) {//United States of America /68,300,000 language = "en";//english } else if (host_tld.equals("ua")) {//Ukraine /6,820,000 language = "uk";//ukrainian; ukr } else if (host_tld.equals("uz")) {//Uzbekistan /2,610,000 language = "uz";//uzbek; uzb //language = "ru";//russian; rus (14% native) } else if (host_tld.equals("uy")) {//Uruguay /2,020,000 language = "es";//spanish; spa //language = "en";//english } else if (host_tld.equals("ug")) {//Uganda /337,000 language = "sw";//swahili; swa; swc //language = "en";//english (also ugandan english) //language = "lg";//ganda; lug (not all territory) } break; case 'v': if (host_tld.equals("vu")) {//Vanuatu /5,050,000 language = "en";//english (education) //language = "bi";//bislama; bis (creole language, 
used as pidgin) //language = "fr";//french; fre (fra); fra (education) //many native languages, but no-one primary } else if (host_tld.equals("ve")) {//Venezuela /3,050,000 language = "es";//spanish; spa //language = "en";//english //language = "it";//italian; ita //also many indigenous languages } else if (host_tld.equals("vn")) {//Vietnam /2,490,000 language = "vi";//vietnamese; vie } else if (host_tld.equals("va")) {//Vatican City /852,000 language = "it";//italian; ita } else if (host_tld.equals("vg")) {//British Virgin Islands /882,000 language = "en";//english //language = "en";//virgin islands creole english; eng; vic } else if (host_tld.equals("vc")) {//Saint Vincent and the Grenadines /239,000 language = "en";//english //language = "en";//vincentiancreole; eng; svc (home and friends) //language = "bh";//bhojpuri; bho (east indian language) //native indians 2% and no data about their language } else if (host_tld.equals("vi")) {//United States Virgin Islands /202,000 language = "en";//english //language = "en";//virgin islands creole english; eng; vic //language = "es";//spanish; spa //language = "fr";//french; fre (fra); fra } break; case 'w': if (host_tld.equals("ws")) {//Samoa /3,000,000 language = "sm";//Samoan; smo (most people) //but maybe english from the world also (!) } else if (host_tld.equals("wf")) {//Wallis and Futuna /30 language = "fr";//french; fre (fra); fra //language = "";//wallisian; wls (no ISO 639-1,2) //language = "";//futunan; fud (no ISO 639-1,2) //could: wallisian+futunan=88.5%; french=78.2% //had no knowledge: wallisian|futunan=7.2%; french=17.3% (!) 
} break; case 'x': break; case 'y': if (host_tld.equals("yu")) {//Yugoslavia /3,270,000 language = "sh";//serbo-croatian; scr, scc; hbs (srp, hrv, bos) } else if (host_tld.equals("ye")) {//Yemen /93,800 language = "ar";//yemeni arabic; ara; ayh (hadhrami), ayn (aanaani), acq(ta'izzi-adeni) } else if (host_tld.equals("yt")) {//Mayotte /34 language = "fr";//french; fre (fra); fra (55% read/write) //language = "sw";//maore comorian; swa; swb (41% r/w) //language = "ar";//yemeni arabic; ara (33% r/w) } break; case 'z': if (host_tld.equals("za")) {//South Africa /16,400,000 //language = "zu";//zulu; zul (23.8%) //language = "xh";//xhosa; xho (17.6%) language = "af";//afrikaans; afr (13.3%) //language = "en";//english; (8.2%, but language of commerce and science) //need research (!) } else if (host_tld.equals("zw")) {//Zimbabwe /507,000 language = "sn";//shona; sna (70%) //language = "nd";//ndebele; nde (20%) //language = "en"//english (2.5%, but traditionally used for official business) } else if (host_tld.equals("zm")) {//Zambia /324,000 language = "en";//english (official business and is the medium of instruction in schools) //language = "ny";//chewa; nya } break; } break; case 3: if (host_tld.equals("cat")) {//Catalan linguistic and cultural community /22,479 language = "ca";//catalan; cat } break; case 8: if (host_tld.equals("xn--p1ai")) {//Russia/Cyrillic /67,900,000* language = "ru";//russian; rus } else if (host_tld.equals("xn--node")) {//Georgia/Georgian /2,480,000* language = "ka";//georgian; geo (kat); kat //Proposed } break; case 9: if (host_tld.equals("xn--j1amh")) {//Ukraine/Cyrillic /6,820,000* language = "uk";//ukrainian; ukr //Proposed } break; case 10: if (host_tld.equals("xn--fiqs8s")) {//China/Simplified Chinese /26,700,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--fiqz9s")) {//China/Traditional Chinese /26,700,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern 
Standard Mandarin) } else if (host_tld.equals("xn--o3cw4h")) {//Thailand/Thai script /6,470,000* language = "th";//thai; tha } else if (host_tld.equals("xn--wgbh1c")) {//Egypt/Arabic /2,990,000* language = "ar";//modern standard arabic; ara; arb } else if (host_tld.equals("xn--wgbl6a")) {//Qatar/Arabic /259,000* language = "ar";//gulf arabic; ara; afb } else if (host_tld.equals("xn--90a3ac")) {//Serbia/Cyrillic /? language = "sr";//serbian; srp } else if (host_tld.equals("xn--wgv71a")) {//Japan/Japanese /139,000,000* language = "ja";//japanese; jpn //Proposed } break; case 11: if (host_tld.equals("xn--kprw13d")) {//Taiwan/Simplified Chinese /14,000,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--kpry57d")) {//Taiwan/Simplified Chinese /14,000,000* language = "zh";//chinese; chi (zho); cmn - Mandarin (Modern Standard Mandarin) } else if (host_tld.equals("xn--j6w193g")) {//Hong Kong/Traditional Chinese /9,510,000* language = "zh";//chinese; chi (zho, cmn) } else if (host_tld.equals("xn--h2brj9c")) {//India/Devanagari /9,330,000* language = "hi";//hindi; hin } else if (host_tld.equals("xn--gecrj9c")) {//India/Gujarati /9,330,000* language = "gu";//gujarati; guj //also can be Kutchi and Hindi } else if (host_tld.equals("xn--s9brj9c")) {//India/Gurmukhi /9,330,000* language = "pa";//punjabi; pan } else if (host_tld.equals("xn--45brj9c")) {//India/Bengali /9,330,000* language = "bn";//bengali; ben } else if (host_tld.equals("xn--pgbs0dh")) {//Tunisia/Arabic /1,060,000* language = "ar";//tunisian arabic; ara; aeb } else if (host_tld.equals("xn--80ao21a")) {//Kazakhstan/Cyrillic /2,680,000* language = "kk";//kazakh; kaz //Proposed } break; case 12: if (host_tld.equals("xn--3e0b707e")) {//South Korea/Hangul /13,700,000* language = "ko";//korean; kor } else if (host_tld.equals("xn--mgbtf8fl")) {//Syria/Arabic /115,000* language = "ar";//syrian arabic; ara; apc, ajp } else if 
(host_tld.equals("xn--4dbrk0ce")) {//Israel/Hebrew /17,800,000* language = "he";//hebrew; heb //Proposed } else if (host_tld.equals("xn--mgb9awbf")) {//Oman/Arabic /204,000 language = "ar";//omani arabic; ara; acx //Proposed } else if (host_tld.equals("xn--mgb2ddes")) {//Yemen/Arabic /93,800* language = "ar";//yemeni arabic; ara; ayh (hadhrami), ayn (aanaani), acq(ta'izzi-adeni) //Proposed } break; case 13: if (host_tld.equals("xn--fpcrj9c3d")) {//India/Telugu /9,330,000* language = "te";//telugu; tel } else if (host_tld.equals("xn--yfro4i67o")) {//Singapore/Chinese /8,770,000* language = "zh";//singaporean mandarin (chinese); chi (zho); cmn } else if (host_tld.equals("xn--fzc2c9e2c")) {//Sri Lanka/Sinhala language /1,770,000* language = "si";//sinhala; sin } else if (host_tld.equals("xn--ygbi2ammx")) {//Palestinian Territory/Arabic /559,000* language = "ar";//palestinian arabic; ara; ajp } break; case 14: if (host_tld.equals("xn--mgbbh1a71e")) {//India/Urdu /9,330,000* language = "ur";//urdu; urd } else if (host_tld.equals("xn--mgbaam7a8h")) {//United Arab Emirates/Arabic /3,310,000* language = "ar";//arabic } else if (host_tld.equals("xn--mgbayh7gpa")) {//Jordan/Arabic /601,000* language = "ar";//jordanian arabic; ara; ajp } else if (host_tld.equals("xn--mgbx4cd0ab")) {//Malaysia/Arabic(Jawi alphabet?) /4,610,000* language = "ar";//arabic //Proposed (why not malay?) 
} else if (host_tld.equals("xn--54b7fta0cc")) {//Bangladesh/Bengali /342,000* language = "bn";//bengali; ben //Proposed } break; case 15: if (host_tld.equals("xn--mgbc0a9azcg")) {//Morocco/Arabic /3,030,000* language = "ar";//moroccan arabic; ara; ary } else if (host_tld.equals("xn--mgba3a4f16a")) {//Iran/Persian /2,940,000* language = "fa";//persian; per (fas); pes } else if (host_tld.equals("xn--lgbbat1ad8j")) {//Algeria/Arabic /326,000* language = "ar";//arabic; ara; arq } break; case 16: if (host_tld.equals("xn--xkc2al3hye2a")) {//Sri Lanka/Tamil /1,770,000* language = "ta";//tamil; tam } break; case 17: if (host_tld.equals("xn--xkc2dl3a5ee0h")) {//India/Tamil /9,330,000* language = "ta";//tamil; tam //Badaga (ISO 639-3:bfq), Irula (ISO 639-3:iru), Paniya (ISO 639-3:pcg) } else if (host_tld.equals("xn--mgberp4a5d4ar")) {//Saudi Arabia/Arabic /2,770,000* language = "ar";//gulf arabic; ara; afb } else if (host_tld.equals("xn--mgbai9azgqp6j")) {//Pakistan/Arabic /3,180,000* language = "ar";//arabic //Proposed (why not urdu?) //language = "ur";//urdu; urd (lingua franca and national language) } break; case 22: if (host_tld.equals("xn--clchc0ea0b2g2a9gcd")) {//Singapore/Tamil /8,770,000* language = "ta";//tamil; tam } //* - stats from ccTLD break; default: break; } //6: ISO 639-6 Part 6: Alpha-4 - most of small languages from ISO 639-3 not exists. //ISO 639-2 languages included, but not all. return language; }
From source file:com.facebook.GraphRequest.java
private static void processGraphObjectProperty(String key, Object value, KeyValueSerializer serializer, boolean passByValue) throws IOException { Class<?> valueClass = value.getClass(); if (JSONObject.class.isAssignableFrom(valueClass)) { JSONObject jsonObject = (JSONObject) value; if (passByValue) { // We need to pass all properties of this object in key[propertyName] format. @SuppressWarnings("unchecked") Iterator<String> keys = jsonObject.keys(); while (keys.hasNext()) { String propertyName = keys.next(); String subKey = String.format("%s[%s]", key, propertyName); processGraphObjectProperty(subKey, jsonObject.opt(propertyName), serializer, passByValue); }//from w w w. j ava2 s . c o m } else { // Normal case is passing objects by reference, so just pass the ID or URL, if any, // as the value for "key" if (jsonObject.has("id")) { processGraphObjectProperty(key, jsonObject.optString("id"), serializer, passByValue); } else if (jsonObject.has("url")) { processGraphObjectProperty(key, jsonObject.optString("url"), serializer, passByValue); } else if (jsonObject.has(NativeProtocol.OPEN_GRAPH_CREATE_OBJECT_KEY)) { processGraphObjectProperty(key, jsonObject.toString(), serializer, passByValue); } } } else if (JSONArray.class.isAssignableFrom(valueClass)) { JSONArray jsonArray = (JSONArray) value; int length = jsonArray.length(); for (int i = 0; i < length; ++i) { String subKey = String.format(Locale.ROOT, "%s[%d]", key, i); processGraphObjectProperty(subKey, jsonArray.opt(i), serializer, passByValue); } } else if (String.class.isAssignableFrom(valueClass) || Number.class.isAssignableFrom(valueClass) || Boolean.class.isAssignableFrom(valueClass)) { serializer.writeString(key, value.toString()); } else if (Date.class.isAssignableFrom(valueClass)) { Date date = (Date) value; // The "Events Timezone" platform migration affects what date/time formats Facebook // accepts and returns. 
Apps created after 8/1/12 (or apps that have explicitly enabled // the migration) should send/receive dates in ISO-8601 format. Pre-migration apps can // send as Unix timestamps. Since the future is ISO-8601, that is what we support here. // Apps that need pre-migration behavior can explicitly send these as integer timestamps // rather than Dates. final SimpleDateFormat iso8601DateFormat = new SimpleDateFormat(ISO_8601_FORMAT_STRING, Locale.US); serializer.writeString(key, iso8601DateFormat.format(date)); } }
From source file:com.gargoylesoftware.htmlunit.javascript.host.html.HTMLDocument.java
private boolean hasCommand(final String cmd) { if (null == cmd) { return false; }//from w w w .j a va 2 s . c o m final String cmdLC = cmd.toLowerCase(Locale.ROOT); if (getBrowserVersion().isIE()) { return EXECUTE_CMDS_IE.contains(cmdLC); } if (getBrowserVersion().isChrome()) { return EXECUTE_CMDS_CHROME.contains(cmdLC); } return EXECUTE_CMDS_FF.contains(cmdLC); }
From source file:com.gargoylesoftware.htmlunit.html.HtmlPage.java
/** * Gets the meta tag for a given {@code http-equiv} value. * @param httpEquiv the {@code http-equiv} value * @return a list of {@link HtmlMeta}// w ww .ja v a 2s . co m */ protected List<HtmlMeta> getMetaTags(final String httpEquiv) { if (getDocumentElement() == null) { return Collections.emptyList(); // weird case, for instance if document.documentElement has been removed } final String nameLC = httpEquiv.toLowerCase(Locale.ROOT); final List<HtmlMeta> tags = getDocumentElement().getHtmlElementsByTagName("meta"); for (final Iterator<HtmlMeta> iter = tags.iterator(); iter.hasNext();) { final HtmlMeta element = iter.next(); if (!nameLC.equals(element.getHttpEquivAttribute().toLowerCase(Locale.ROOT))) { iter.remove(); } } return tags; }
From source file:org.elasticsearch.client.RequestConvertersTests.java
private static void resizeTest(ResizeType resizeType, CheckedFunction<ResizeRequest, Request, IOException> function) throws IOException { String[] indices = randomIndicesNames(2, 2); ResizeRequest resizeRequest = new ResizeRequest(indices[0], indices[1]); resizeRequest.setResizeType(resizeType); Map<String, String> expectedParams = new HashMap<>(); setRandomMasterTimeout(resizeRequest, expectedParams); setRandomTimeout(resizeRequest::timeout, resizeRequest.timeout(), expectedParams); if (randomBoolean()) { CreateIndexRequest createIndexRequest = new CreateIndexRequest(randomAlphaOfLengthBetween(3, 10)); if (randomBoolean()) { createIndexRequest.settings(randomIndexSettings()); }//from w w w .j a v a2s .c o m if (randomBoolean()) { randomAliases(createIndexRequest); } resizeRequest.setTargetIndex(createIndexRequest); } setRandomWaitForActiveShards(resizeRequest::setWaitForActiveShards, expectedParams); Request request = function.apply(resizeRequest); assertEquals(HttpPut.METHOD_NAME, request.getMethod()); String expectedEndpoint = "/" + resizeRequest.getSourceIndex() + "/_" + resizeType.name().toLowerCase(Locale.ROOT) + "/" + resizeRequest.getTargetIndexRequest().index(); assertEquals(expectedEndpoint, request.getEndpoint()); assertEquals(expectedParams, request.getParameters()); assertToXContentBody(resizeRequest, request.getEntity()); }
From source file:com.facebook.GraphRequest.java
private static String getUserAgent() { if (userAgent == null) { userAgent = String.format("%s.%s", USER_AGENT_BASE, FacebookSdkVersion.BUILD); // For the unity sdk we need to append the unity user agent String customUserAgent = InternalSettings.getCustomUserAgent(); if (!Utility.isNullOrEmpty(customUserAgent)) { userAgent = String.format(Locale.ROOT, "%s/%s", userAgent, customUserAgent); }/*from ww w . j a v a 2s. co m*/ } return userAgent; }
From source file:org.apache.manifoldcf.crawler.connectors.rss.RSSConnector.java
/** Process a set of documents. * This is the method that should cause each document to be fetched, processed, and the results either added * to the queue of documents for the current job, and/or entered into the incremental ingestion manager. * The document specification allows this class to filter what is done based on the job. * The connector will be connected before this method can be called. *@param documentIdentifiers is the set of document identifiers to process. *@param statuses are the currently-stored document versions for each document in the set of document identifiers * passed in above.// w ww.j a v a 2s . com *@param activities is the interface this method should use to queue up new document references * and ingest documents. *@param jobMode is an integer describing how the job is being run, whether continuous or once-only. *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one. */ @Override public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption { getSession(); // The connection limit is designed to permit this connector to coexist with potentially other connectors, such as the web connector. // There is currently no good way to enforce connection limits across all installed connectors - this will require considerably more // thought to set up properly. 
int connectionLimit = 200; String[] fixedList = new String[2]; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: In getDocumentVersions for " + Integer.toString(documentIdentifiers.length) + " documents"); Filter f = new Filter(spec, false); String[] acls = f.getAcls(); // Sort it, java.util.Arrays.sort(acls); // NOTE: There are two kinds of documents in here; documents that are RSS feeds (that presumably have a content-type // of text/xml), and documents that need to be indexed. // // For the latter, the metadata etc is part of the version string. For the former, the only thing that is part of the version string is the // document's checksum. // // The need to exclude documents from fetch based on whether they match an expression causes some difficulties, because we really // DON'T want this to apply to the feeds themselves. Since the distinguishing characteristic of a feed is that it is in the seed list, // and that its content-type is text/xml, we could use either of these characteristics to treat feeds differently from // fetchable urls. But the latter approach requires a fetch, which is forbidden. So - the spec will be used to characterize the url. // However, the spec might change, and the url might be dropped from the list - and then what?? // // The final solution is to simply not queue what cannot be mapped. int feedTimeout = f.getFeedTimeoutValue(); // The document specification has already been used to trim out documents that are not // allowed from appearing in the queue. So, even that has already been done. for (String documentIdentifier : documentIdentifiers) { // If it is in this list, we presume that it has been vetted against the map etc., so we don't do that again. We just fetch it. // And, if the content type is xml, we calculate the version as if it is a feed rather than a document. 
// Get the url String urlValue = documentIdentifier; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Getting version string for '" + urlValue + "'"); String versionString; String ingestURL = null; String[] pubDates = null; String[] sources = null; String[] titles = null; String[] authorNames = null; String[] authorEmails = null; String[] categories = null; String[] descriptions = null; try { // If there's a carrydown "data" value for this url, we use that value rather than actually fetching the document. This also means we don't need to // do a robots check, because we aren't actually crawling anything. So, ALWAYS do this first... CharacterInput[] dechromedData = activities.retrieveParentDataAsFiles(urlValue, "data"); try { if (dechromedData.length > 0) { // Data already available. The fetch cycle can be entirely avoided, as can the robots check. ingestURL = f.mapDocumentURL(urlValue); if (ingestURL != null) { // Open up an input stream corresponding to the carrydown data. The stream will be encoded as utf-8. try { InputStream is = dechromedData[0].getUtf8Stream(); try { StringBuilder sb = new StringBuilder(); long checkSum = cache.addData(activities, urlValue, "text/html", is); // Grab what we need from the passed-down data for the document. These will all become part // of the version string. 
pubDates = activities.retrieveParentData(urlValue, "pubdate"); sources = activities.retrieveParentData(urlValue, "source"); titles = activities.retrieveParentData(urlValue, "title"); authorNames = activities.retrieveParentData(urlValue, "authorname"); authorEmails = activities.retrieveParentData(urlValue, "authoremail"); categories = activities.retrieveParentData(urlValue, "category"); descriptions = activities.retrieveParentData(urlValue, "description"); java.util.Arrays.sort(pubDates); java.util.Arrays.sort(sources); java.util.Arrays.sort(titles); java.util.Arrays.sort(authorNames); java.util.Arrays.sort(authorEmails); java.util.Arrays.sort(categories); java.util.Arrays.sort(descriptions); if (sources.length == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!"); } sb.append('+'); packList(sb, acls, '+'); if (acls.length > 0) { sb.append('+'); pack(sb, defaultAuthorityDenyToken, '+'); } else sb.append('-'); // The ingestion URL pack(sb, ingestURL, '+'); // The pub dates packList(sb, pubDates, '+'); // The titles packList(sb, titles, '+'); // The sources packList(sb, sources, '+'); // The categories packList(sb, categories, '+'); // The descriptions packList(sb, descriptions, '+'); // The author names packList(sb, authorNames, '+'); // The author emails packList(sb, authorEmails, '+'); // Do the checksum part, which does not need to be parseable. 
sb.append(new Long(checkSum).toString()); versionString = sb.toString(); } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { throw new ManifoldCFException( "IO exception reading data from string: " + e.getMessage(), e); } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { throw new ManifoldCFException( "IO exception reading data from string: " + e.getMessage(), e); } } else { // Document a seed or unmappable; just skip if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping carry-down document '" + urlValue + "' because it is unmappable or is a seed."); } } else { // Get the old version string String oldVersionString = statuses.getIndexedVersionString(documentIdentifier); // Unpack the old version as much as possible. // We are interested in what the ETag and Last-Modified headers were last time. String lastETagValue = null; String lastModifiedValue = null; // Note well: Non-continuous jobs cannot use etag because the rss document MUST be fetched each time for such jobs, // or the documents it points at would get deleted. // // NOTE: I disabled this code because we really need the feed's TTL value in order to reschedule properly. I can't get the // TTL value without refetching the document - therefore ETag and Last-Modified cannot be used :-( if (false && jobMode == JOBMODE_CONTINUOUS && oldVersionString != null && oldVersionString.startsWith("-")) { // It's a feed, so the last etag and last-modified fields should be encoded in this version string. 
StringBuilder lastETagBuffer = new StringBuilder(); int unpackPos = unpack(lastETagBuffer, oldVersionString, 1, '+'); StringBuilder lastModifiedBuffer = new StringBuilder(); unpackPos = unpack(lastModifiedBuffer, oldVersionString, unpackPos, '+'); if (lastETagBuffer.length() > 0) lastETagValue = lastETagBuffer.toString(); if (lastModifiedBuffer.length() > 0) lastModifiedValue = lastModifiedBuffer.toString(); } if (Logging.connectors.isDebugEnabled() && (lastETagValue != null || lastModifiedValue != null)) Logging.connectors.debug( "RSS: Document '" + urlValue + "' was found to have a previous ETag value of '" + ((lastETagValue == null) ? "null" : lastETagValue) + "' and a previous Last-Modified value of '" + ((lastModifiedValue == null) ? "null" : lastModifiedValue) + "'"); // Robots check. First, we need to separate the url into its components URL url; try { url = new URL(urlValue); } catch (MalformedURLException e) { Logging.connectors.debug("RSS: URL '" + urlValue + "' is malformed; skipping", e); activities.deleteDocument(documentIdentifier); continue; } String protocol = url.getProtocol(); int port = url.getPort(); String hostName = url.getHost(); String pathPart = url.getFile(); // Check with robots to see if it's allowed if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext, throttleGroupName, protocol, port, hostName, url.getPath(), userAgent, from, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities, connectionLimit)) { activities.recordActivity(null, ACTIVITY_FETCH, null, urlValue, Integer.toString(-2), "Robots exclusion", null); if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: Skipping url '" + urlValue + "' because robots.txt says to"); activities.deleteDocument(documentIdentifier); continue; } // Now, use the fetcher, and get the file. 
IThrottledConnection connection = fetcher.createConnection(currentContext, throttleGroupName, hostName, connectionLimit, feedTimeout, proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword, activities); try { // Begin the fetch connection.beginFetch("Data"); try { // Execute the request. // Use the connect timeout from the document specification! int status = connection.executeFetch(protocol, port, pathPart, userAgent, from, lastETagValue, lastModifiedValue); switch (status) { case IThrottledConnection.STATUS_NOCHANGE: versionString = oldVersionString; break; case IThrottledConnection.STATUS_OK: try { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Successfully fetched " + urlValue); // Document successfully fetched! // If its content is xml, presume it's a feed... String contentType = connection.getResponseHeader("Content-Type"); // Some sites have multiple content types. We just look at the LAST one in that case. if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } String strippedContentType = contentType; if (strippedContentType != null) { int pos = strippedContentType.indexOf(";"); if (pos != -1) strippedContentType = strippedContentType.substring(0, pos).trim(); } boolean isXML = (strippedContentType != null && xmlContentTypes.contains(strippedContentType)); ingestURL = null; if (!isXML) { // If the chromed content mode is set to "skip", and we got here, it means // we should not include the content. if (f.getChromedContentMode() == CHROMED_SKIP) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it no longer has dechromed content available"); versionString = null; break; } // Decide whether to exclude this document based on what we see here. 
// Basically, we want to get rid of everything that we don't know what // to do with in the ingestion system. if (!activities.checkMimeTypeIndexable(contentType)) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing url '" + urlValue + "' because it had the wrong content type: " + ((contentType == null) ? "null" : "'" + contentType + "'")); versionString = null; break; } ingestURL = f.mapDocumentURL(urlValue); } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors .debug("RSS: The url '" + urlValue + "' is a feed"); if (!f.isSeed(urlValue)) { // Remove the feed from consideration, since it has left the list of seeds if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Removing feed url '" + urlValue + "' because it is not a seed."); versionString = null; break; } } InputStream is = connection.getResponseBodyStream(); try { long checkSum = cache.addData(activities, urlValue, contentType, is); StringBuilder sb = new StringBuilder(); if (ingestURL != null) { // We think it is ingestable. The version string accordingly starts with a "+". // Grab what we need from the passed-down data for the document. These will all become part // of the version string. 
pubDates = activities.retrieveParentData(urlValue, "pubdate"); sources = activities.retrieveParentData(urlValue, "source"); titles = activities.retrieveParentData(urlValue, "title"); authorNames = activities.retrieveParentData(urlValue, "authorname"); authorEmails = activities.retrieveParentData(urlValue, "authoremail"); categories = activities.retrieveParentData(urlValue, "category"); descriptions = activities.retrieveParentData(urlValue, "description"); java.util.Arrays.sort(pubDates); java.util.Arrays.sort(sources); java.util.Arrays.sort(titles); java.util.Arrays.sort(authorNames); java.util.Arrays.sort(authorEmails); java.util.Arrays.sort(categories); java.util.Arrays.sort(descriptions); if (sources.length == 0) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Warning; URL '" + ingestURL + "' doesn't seem to have any RSS feed source!"); } sb.append('+'); packList(sb, acls, '+'); if (acls.length > 0) { sb.append('+'); pack(sb, defaultAuthorityDenyToken, '+'); } else sb.append('-'); // The ingestion URL pack(sb, ingestURL, '+'); // The pub dates packList(sb, pubDates, '+'); // The titles packList(sb, titles, '+'); // The sources packList(sb, sources, '+'); // The categories packList(sb, categories, '+'); // The descriptions packList(sb, descriptions, '+'); // The author names packList(sb, authorNames, '+'); // The author emails packList(sb, authorEmails, '+'); } else { sb.append('-'); String etag = connection.getResponseHeader("ETag"); if (etag == null) pack(sb, "", '+'); else pack(sb, etag, '+'); String lastModified = connection.getResponseHeader("Last-Modified"); if (lastModified == null) pack(sb, "", '+'); else pack(sb, lastModified, '+'); } // Do the checksum part, which does not need to be parseable. 
sb.append(new Long(checkSum).toString()); versionString = sb.toString(); } finally { is.close(); } } catch (java.net.SocketTimeoutException e) { Logging.connectors .warn("RSS: Socket timeout exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } catch (ConnectTimeoutException e) { Logging.connectors .warn("RSS: Connecto timeout exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } catch (InterruptedIOException e) { throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED); } catch (IOException e) { Logging.connectors.warn("RSS: IO exception fetching document contents '" + urlValue + "' - skipping: " + e.getMessage(), e); versionString = null; } break; case IThrottledConnection.STATUS_SITEERROR: case IThrottledConnection.STATUS_PAGEERROR: default: // Record an *empty* version. // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times. versionString = ""; break; } } finally { connection.doneFetch(activities); } } finally { connection.close(); } if (versionString == null) { activities.deleteDocument(documentIdentifier); continue; } if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))) continue; // Process document! if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Processing '" + urlValue + "'"); // The only links we extract come from documents that we think are RSS feeds. // When we think that's the case, we attempt to parse it as RSS XML. if (ingestURL == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Interpreting document '" + urlValue + "' as a feed"); // We think it is a feed. 
// If this is a continuous job, AND scanonly is true, it means that the document was either identical to the // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't // want to attempt to process it in any case. // // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds. if (true || jobMode != JOBMODE_CONTINUOUS) { handleRSSFeedSAX(urlValue, activities, f); if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Extraction of feed '" + urlValue + "' complete"); // Record the feed's version string, so we won't refetch unless needed. // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to // keep track of the adaptive parameters. activities.recordDocument(documentIdentifier, versionString); } else { // The problem here is that we really do need to set the rescan time to something reasonable. // But we might not even have read the feed! So what to do?? // One answer is to build a connector-specific table that carries the last value of every feed around. // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified). 
if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Feed '" + urlValue + "' does not appear to differ from previous fetch for a continuous job; not extracting!"); long currentTime = System.currentTimeMillis(); Long defaultRescanTime = f.getDefaultRescanTime(currentTime); if (defaultRescanTime != null) { Long minimumTime = f.getMinimumRescanTime(currentTime); if (minimumTime != null) { if (defaultRescanTime.longValue() < minimumTime.longValue()) defaultRescanTime = minimumTime; } } activities.setDocumentScheduleBounds(urlValue, defaultRescanTime, defaultRescanTime, null, null); } } else { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Interpreting '" + urlValue + "' as a document"); String errorCode = null; String errorDesc = null; long startTime = System.currentTimeMillis(); Long fileLengthLong = null; try { long documentLength = cache.getDataLength(documentIdentifier); if (!activities.checkLengthIndexable(documentLength)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_LENGTH; errorDesc = "Document rejected because of length (" + documentLength + ")"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its length was rejected (" + documentLength + ")"); continue; } if (!activities.checkURLIndexable(documentIdentifier)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_URL; errorDesc = "Document rejected because of URL ('" + documentIdentifier + "')"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its URL was rejected ('" + documentIdentifier + "')"); continue; } // Check if it's a recognized content type String contentType = cache.getContentType(documentIdentifier); // Some sites have multiple content types. We just look at the LAST one in that case. 
if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } if (!activities.checkMimeTypeIndexable(contentType)) { activities.noDocument(documentIdentifier, versionString); errorCode = activities.EXCLUDED_MIMETYPE; errorDesc = "Document rejected because of mime type (" + contentType + ")"; if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("RSS: Skipping document '" + urlValue + "' because its mime type was rejected ('" + contentType + "')"); continue; } // Treat it as an ingestable document. long dataSize = cache.getDataLength(urlValue); RepositoryDocument rd = new RepositoryDocument(); // Set content type if (contentType != null) rd.setMimeType(contentType); // Turn into acls and add into description String[] denyAcls; if (acls == null) denyAcls = null; else if (acls.length == 0) denyAcls = new String[0]; else denyAcls = new String[] { defaultAuthorityDenyToken }; if (acls != null && denyAcls != null) rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls); if (titles != null && titles.length > 0) rd.addField("title", titles); if (authorNames != null && authorNames.length > 0) rd.addField("authorname", authorNames); if (authorEmails != null && authorEmails.length > 0) rd.addField("authoremail", authorEmails); if (descriptions != null && descriptions.length > 0) rd.addField("summary", descriptions); if (sources != null && sources.length > 0) rd.addField("source", sources); if (categories != null && categories.length > 0) rd.addField("category", categories); // The pubdates are a ms since epoch value; we want the minimum one for the origination time. 
Long minimumOrigTime = null; if (pubDates != null && pubDates.length > 0) { String[] pubDateValuesISO = new String[pubDates.length]; TimeZone tz = TimeZone.getTimeZone("UTC"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'"); df.setTimeZone(tz); for (int k = 0; k < pubDates.length; k++) { String pubDate = pubDates[k]; try { Long pubDateLong = new Long(pubDate); if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue()) minimumOrigTime = pubDateLong; pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue())); } catch (NumberFormatException e) { // Do nothing; the version string seems to not mean anything pubDateValuesISO[k] = ""; } } rd.addField("pubdate", pubDates); rd.addField("pubdateiso", pubDateValuesISO); } if (minimumOrigTime != null) activities.setDocumentOriginationTime(urlValue, minimumOrigTime); InputStream is = cache.getData(urlValue); if (is != null) { try { rd.setBinary(is, dataSize); try { activities.ingestDocumentWithException(documentIdentifier, versionString, ingestURL, rd); errorCode = "OK"; fileLengthLong = new Long(dataSize); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "reading data"); } } finally { try { is.close(); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "closing stream"); } } } } catch (ManifoldCFException e) { if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) errorCode = null; throw e; } finally { if (errorCode != null) activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, null, urlValue, errorCode, errorDesc, null); } } } } finally { for (CharacterInput ci : dechromedData) { if (ci != null) ci.discard(); } } } finally { // Remove any fetched documents. cache.deleteData(documentIdentifier); } } }
From source file:org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector.java
protected void processDocument(IProcessActivity activities, String documentIdentifier, String versionString, boolean indexDocument, Map<String, Set<String>> metaHash, String[] acls, DocumentURLFilter filter) throws ManifoldCFException, ServiceInterruption { // Consider this document for ingestion. String errorCode = null;/*from w ww . j a v a 2s . co m*/ String errorDesc = null; Long fileLengthLong = null; long startTime = System.currentTimeMillis(); try { // We can exclude it if it does not seem to be a kind of document that the ingestion system knows // about. if (!indexDocument) { errorCode = "CONTENTNOTINDEXABLE"; errorDesc = "Content not indexable"; activities.noDocument(documentIdentifier, versionString); return; } int responseCode = cache.getResponseCode(documentIdentifier); if (responseCode != 200) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Web: For document '" + documentIdentifier + "', not indexing because response code not indexable: " + responseCode); errorCode = "RESPONSECODENOTINDEXABLE"; errorDesc = "HTTP response code not indexable (" + responseCode + ")"; activities.noDocument(documentIdentifier, versionString); return; } long dataLength = cache.getDataLength(documentIdentifier); if (!activities.checkLengthIndexable(dataLength)) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Web: For document '" + documentIdentifier + "', not indexing because pipeline thinks length " + dataLength + " is not acceptable"); errorCode = activities.EXCLUDED_LENGTH; errorDesc = "Rejected due to length (" + dataLength + ")"; activities.noDocument(documentIdentifier, versionString); return; } if (activities.checkURLIndexable(documentIdentifier) == false) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Web: For document '" + documentIdentifier + "', not indexing because output connector does not want URL"); errorCode = activities.EXCLUDED_URL; errorDesc = "Rejected due to URL ('" + documentIdentifier + 
"')"; activities.noDocument(documentIdentifier, versionString); return; } String ingestURL = filter.isDocumentIndexable(documentIdentifier); if (ingestURL == null) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Web: For document '" + documentIdentifier + "', not indexing because document does not match web job constraints"); errorCode = "JOBRESTRICTION"; errorDesc = "Rejected because job excludes this URL ('" + documentIdentifier + "')"; activities.noDocument(documentIdentifier, versionString); return; } // Check if it's a recognized content type String contentType = cache.getContentType(documentIdentifier); // Some sites have multiple content types. We just look at the LAST one in that case. if (contentType != null) { String[] contentTypes = contentType.split(","); if (contentTypes.length > 0) contentType = contentTypes[contentTypes.length - 1].trim(); else contentType = null; } if (contentType != null) { int pos = contentType.indexOf(";"); if (pos != -1) contentType = contentType.substring(0, pos); contentType = contentType.trim(); } if (!activities.checkMimeTypeIndexable(contentType)) { if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("Web: For document '" + documentIdentifier + "', not indexing because output connector does not want mime type '" + contentType + "'"); errorCode = activities.EXCLUDED_MIMETYPE; errorDesc = "Rejected because of mime type (" + contentType + ")"; activities.noDocument(documentIdentifier, versionString); return; } // Ingest the document if (Logging.connectors.isDebugEnabled()) Logging.connectors.debug("WEB: Decided to ingest '" + documentIdentifier + "'"); RepositoryDocument rd = new RepositoryDocument(); // Set the file name String fileName = ""; try { fileName = documentIdentifiertoFileName(documentIdentifier); } catch (URISyntaxException e1) { fileName = ""; } if (fileName.length() > 0) { rd.setFileName(fileName); } // Set the content type String mimeType = 
cache.getContentType(documentIdentifier); if (mimeType != null) rd.setMimeType(mimeType); // Turn into acls and add into description String[] denyAcls; if (acls == null) denyAcls = null; else { if (acls.length > 0) denyAcls = new String[] { defaultAuthorityDenyToken }; else denyAcls = new String[0]; } if (acls != null && denyAcls != null) rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls, denyAcls); // Grab metadata for (String key : metaHash.keySet()) { Set<String> metaList = metaHash.get(key); String[] values = new String[metaList.size()]; int k = 0; for (String value : metaList) { values[k++] = value; } rd.addField(key, values); } InputStream is = cache.getData(documentIdentifier); if (is != null) { try { rd.setBinary(is, dataLength); try { activities.ingestDocumentWithException(documentIdentifier, versionString, ingestURL, rd); errorCode = "OK"; fileLengthLong = new Long(dataLength); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "reading data"); } } finally { try { is.close(); } catch (IOException e) { errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT); errorDesc = e.getMessage(); handleIOException(e, "closing stream"); } } } else Logging.connectors.error( "WEB: Expected a cached document for '" + documentIdentifier + "', but none present!"); } catch (ManifoldCFException e) { if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) errorCode = null; throw e; } finally { if (errorCode != null) activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, fileLengthLong, documentIdentifier, errorCode, errorDesc, null); } }