Login

ivantcholakov · 04-21-2015, 11:03 AM

@mwhitney

Q
"So you chose a corporate extension to multiple existing standards, but didn't actually follow the format provided? Examples: Brazilian Portuguese in CLDR is pt_BR, Latin American Spanish is es_419."

A
I took the values from the CLDR JSON files; the codes there are written with dashes (pt-BR, es-419). Here is a copy/paste of the English one:

Code:
{

  "main": {

    "en": {

      "identity": {

        "version": {

          "_cldrVersion": "26",

          "_number": "$Revision: 10887 $"

        },

        "generation": {

          "_date": "$Date: 2014-08-31 14:23:52 -0500 (Sun, 31 Aug 2014) $"

        },

        "language": "en"

      },

      "localeDisplayNames": {

        "languages": {

          "aa": "Afar",

          "ab": "Abkhazian",

          "ace": "Achinese",

          "ach": "Acoli",

          "ada": "Adangme",

          "ady": "Adyghe",

          "ae": "Avestan",

          "aeb": "Tunisian Arabic",

          "af": "Afrikaans",

          "afh": "Afrihili",

          "agq": "Aghem",

          "ain": "Ainu",

          "ak": "Akan",

          "akk": "Akkadian",

          "akz": "Alabama",

          "ale": "Aleut",

          "aln": "Gheg Albanian",

          "alt": "Southern Altai",

          "am": "Amharic",

          "an": "Aragonese",

          "ang": "Old English",

          "anp": "Angika",

          "ar": "Arabic",

          "ar-001": "Modern Standard Arabic",

          "arc": "Aramaic",

          "arn": "Mapuche",

          "aro": "Araona",

          "arp": "Arapaho",

          "arq": "Algerian Arabic",

          "arw": "Arawak",

          "ary": "Moroccan Arabic",

          "arz": "Egyptian Arabic",

          "as": "Assamese",

          "asa": "Asu",

          "ase": "American Sign Language",

          "ast": "Asturian",

          "av": "Avaric",

          "avk": "Kotava",

          "awa": "Awadhi",

          "ay": "Aymara",

          "az": "Azerbaijani",

          "az-alt-short": "Azeri",

          "azb": "South Azerbaijani",

          "ba": "Bashkir",

          "bal": "Baluchi",

          "ban": "Balinese",

          "bar": "Bavarian",

          "bas": "Basaa",

          "bax": "Bamun",

          "bbc": "Batak Toba",

          "bbj": "Ghomala",

          "be": "Belarusian",

          "bej": "Beja",

          "bem": "Bemba",

          "bew": "Betawi",

          "bez": "Bena",

          "bfd": "Bafut",

          "bfq": "Badaga",

          "bg": "Bulgarian",

          "bho": "Bhojpuri",

          "bi": "Bislama",

          "bik": "Bikol",

          "bin": "Bini",

          "bjn": "Banjar",

          "bkm": "Kom",

          "bla": "Siksika",

          "bm": "Bambara",

          "bn": "Bengali",

          "bo": "Tibetan",

          "bpy": "Bishnupriya",

          "bqi": "Bakhtiari",

          "br": "Breton",

          "bra": "Braj",

          "brh": "Brahui",

          "brx": "Bodo",

          "bs": "Bosnian",

          "bss": "Akoose",

          "bua": "Buriat",

          "bug": "Buginese",

          "bum": "Bulu",

          "byn": "Blin",

          "byv": "Medumba",

          "ca": "Catalan",

          "cad": "Caddo",

          "car": "Carib",

          "cay": "Cayuga",

          "cch": "Atsam",

          "ce": "Chechen",

          "ceb": "Cebuano",

          "cgg": "Chiga",

          "ch": "Chamorro",

          "chb": "Chibcha",

          "chg": "Chagatai",

          "chk": "Chuukese",

          "chm": "Mari",

          "chn": "Chinook Jargon",

          "cho": "Choctaw",

          "chp": "Chipewyan",

          "chr": "Cherokee",

          "chy": "Cheyenne",

          "ckb": "Sorani Kurdish",

          "co": "Corsican",

          "cop": "Coptic",

          "cps": "Capiznon",

          "cr": "Cree",

          "crh": "Crimean Turkish",

          "cs": "Czech",

          "csb": "Kashubian",

          "cu": "Church Slavic",

          "cv": "Chuvash",

          "cy": "Welsh",

          "da": "Danish",

          "dak": "Dakota",

          "dar": "Dargwa",

          "dav": "Taita",

          "de": "German",

          "de-AT": "Austrian German",

          "de-CH": "Swiss High German",

          "del": "Delaware",

          "den": "Slave",

          "dgr": "Dogrib",

          "din": "Dinka",

          "dje": "Zarma",

          "doi": "Dogri",

          "dsb": "Lower Sorbian",

          "dtp": "Central Dusun",

          "dua": "Duala",

          "dum": "Middle Dutch",

          "dv": "Divehi",

          "dyo": "Jola-Fonyi",

          "dyu": "Dyula",

          "dz": "Dzongkha",

          "dzg": "Dazaga",

          "ebu": "Embu",

          "ee": "Ewe",

          "efi": "Efik",

          "egl": "Emilian",

          "egy": "Ancient Egyptian",

          "eka": "Ekajuk",

          "el": "Greek",

          "elx": "Elamite",

          "en": "English",

          "en-AU": "Australian English",

          "en-CA": "Canadian English",

          "en-GB": "British English",

          "en-GB-alt-short": "U.K. English",

          "en-US": "American English",

          "en-US-alt-short": "U.S. English",

          "enm": "Middle English",

          "eo": "Esperanto",

          "es": "Spanish",

          "es-419": "Latin American Spanish",

          "es-ES": "European Spanish",

          "es-MX": "Mexican Spanish",

          "esu": "Central Yupik",

          "et": "Estonian",

          "eu": "Basque",

          "ewo": "Ewondo",

          "ext": "Extremaduran",

          "fa": "Persian",

          "fan": "Fang",

          "fat": "Fanti",

          "ff": "Fulah",

          "fi": "Finnish",

          "fil": "Filipino",

          "fit": "Tornedalen Finnish",

          "fj": "Fijian",

          "fo": "Faroese",

          "fon": "Fon",

          "fr": "French",

          "fr-CA": "Canadian French",

          "fr-CH": "Swiss French",

          "frc": "Cajun French",

          "frm": "Middle French",

          "fro": "Old French",

          "frp": "Arpitan",

          "frr": "Northern Frisian",

          "frs": "Eastern Frisian",

          "fur": "Friulian",

          "fy": "Western Frisian",

          "ga": "Irish",

          "gaa": "Ga",

          "gag": "Gagauz",

          "gan": "Gan Chinese",

          "gay": "Gayo",

          "gba": "Gbaya",

          "gbz": "Zoroastrian Dari",

          "gd": "Scottish Gaelic",

          "gez": "Geez",

          "gil": "Gilbertese",

          "gl": "Galician",

          "glk": "Gilaki",

          "gmh": "Middle High German",

          "gn": "Guarani",

          "goh": "Old High German",

          "gom": "Goan Konkani",

          "gon": "Gondi",

          "gor": "Gorontalo",

          "got": "Gothic",

          "grb": "Grebo",

          "grc": "Ancient Greek",

          "gsw": "Swiss German",

          "gu": "Gujarati",

          "guc": "Wayuu",

          "gur": "Frafra",

          "guz": "Gusii",

          "gv": "Manx",

          "gwi": "Gwichʼin",

          "ha": "Hausa",

          "hai": "Haida",

          "hak": "Hakka Chinese",

          "haw": "Hawaiian",

          "he": "Hebrew",

          "hi": "Hindi",

          "hif": "Fiji Hindi",

          "hil": "Hiligaynon",

          "hit": "Hittite",

          "hmn": "Hmong",

          "ho": "Hiri Motu",

          "hr": "Croatian",

          "hsb": "Upper Sorbian",

          "hsn": "Xiang Chinese",

          "ht": "Haitian",

          "hu": "Hungarian",

          "hup": "Hupa",

          "hy": "Armenian",

          "hz": "Herero",

          "ia": "Interlingua",

          "iba": "Iban",

          "ibb": "Ibibio",

          "id": "Indonesian",

          "ie": "Interlingue",

          "ig": "Igbo",

          "ii": "Sichuan Yi",

          "ik": "Inupiaq",

          "ilo": "Iloko",

          "inh": "Ingush",

          "io": "Ido",

          "is": "Icelandic",

          "it": "Italian",

          "iu": "Inuktitut",

          "izh": "Ingrian",

          "ja": "Japanese",

          "jam": "Jamaican Creole English",

          "jbo": "Lojban",

          "jgo": "Ngomba",

          "jmc": "Machame",

          "jpr": "Judeo-Persian",

          "jrb": "Judeo-Arabic",

          "jut": "Jutish",

          "jv": "Javanese",

          "ka": "Georgian",

          "kaa": "Kara-Kalpak",

          "kab": "Kabyle",

          "kac": "Kachin",

          "kaj": "Jju",

          "kam": "Kamba",

          "kaw": "Kawi",

          "kbd": "Kabardian",

          "kbl": "Kanembu",

          "kcg": "Tyap",

          "kde": "Makonde",

          "kea": "Kabuverdianu",

          "ken": "Kenyang",

          "kfo": "Koro",

          "kg": "Kongo",

          "kgp": "Kaingang",

          "kha": "Khasi",

          "kho": "Khotanese",

          "khq": "Koyra Chiini",

          "khw": "Khowar",

          "ki": "Kikuyu",

          "kiu": "Kirmanjki",

          "kj": "Kuanyama",

          "kk": "Kazakh",

          "kkj": "Kako",

          "kl": "Kalaallisut",

          "kln": "Kalenjin",

          "km": "Khmer",

          "kmb": "Kimbundu",

          "kn": "Kannada",

          "ko": "Korean",

          "koi": "Komi-Permyak",

          "kok": "Konkani",

          "kos": "Kosraean",

          "kpe": "Kpelle",

          "kr": "Kanuri",

          "krc": "Karachay-Balkar",

          "kri": "Krio",

          "krj": "Kinaray-a",

          "krl": "Karelian",

          "kru": "Kurukh",

          "ks": "Kashmiri",

          "ksb": "Shambala",

          "ksf": "Bafia",

          "ksh": "Colognian",

          "ku": "Kurdish",

          "kum": "Kumyk",

          "kut": "Kutenai",

          "kv": "Komi",

          "kw": "Cornish",

          "ky": "Kyrgyz",

          "ky-alt-variant": "Kirghiz",

          "la": "Latin",

          "lad": "Ladino",

          "lag": "Langi",

          "lah": "Lahnda",

          "lam": "Lamba",

          "lb": "Luxembourgish",

          "lez": "Lezghian",

          "lfn": "Lingua Franca Nova",

          "lg": "Ganda",

          "li": "Limburgish",

          "lij": "Ligurian",

          "liv": "Livonian",

          "lkt": "Lakota",

          "lmo": "Lombard",

          "ln": "Lingala",

          "lo": "Lao",

          "lol": "Mongo",

          "loz": "Lozi",

          "lt": "Lithuanian",

          "ltg": "Latgalian",

          "lu": "Luba-Katanga",

          "lua": "Luba-Lulua",

          "lui": "Luiseno",

          "lun": "Lunda",

          "luo": "Luo",

          "lus": "Mizo",

          "luy": "Luyia",

          "lv": "Latvian",

          "lzh": "Literary Chinese",

          "lzz": "Laz",

          "mad": "Madurese",

          "maf": "Mafa",

          "mag": "Magahi",

          "mai": "Maithili",

          "mak": "Makasar",

          "man": "Mandingo",

          "mas": "Masai",

          "mde": "Maba",

          "mdf": "Moksha",

          "mdr": "Mandar",

          "men": "Mende",

          "mer": "Meru",

          "mfe": "Morisyen",

          "mg": "Malagasy",

          "mga": "Middle Irish",

          "mgh": "Makhuwa-Meetto",

          "mgo": "Metaʼ",

          "mh": "Marshallese",

          "mi": "Maori",

          "mic": "Micmac",

          "min": "Minangkabau",

          "mk": "Macedonian",

          "ml": "Malayalam",

          "mn": "Mongolian",

          "mnc": "Manchu",

          "mni": "Manipuri",

          "moh": "Mohawk",

          "mos": "Mossi",

          "mr": "Marathi",

          "mrj": "Western Mari",

          "ms": "Malay",

          "mt": "Maltese",

          "mua": "Mundang",

          "mul": "Multiple Languages",

          "mus": "Creek",

          "mwl": "Mirandese",

          "mwr": "Marwari",

          "mwv": "Mentawai",

          "my": "Burmese",

          "mye": "Myene",

          "myv": "Erzya",

          "mzn": "Mazanderani",

          "na": "Nauru",

          "nan": "Min Nan Chinese",

          "nap": "Neapolitan",

          "naq": "Nama",

          "nb": "Norwegian Bokmål",

          "nd": "North Ndebele",

          "nds": "Low German",

          "ne": "Nepali",

          "new": "Newari",

          "ng": "Ndonga",

          "nia": "Nias",

          "niu": "Niuean",

          "njo": "Ao Naga",

          "nl": "Dutch",

          "nl-BE": "Flemish",

          "nmg": "Kwasio",

          "nn": "Norwegian Nynorsk",

          "nnh": "Ngiemboon",

          "no": "Norwegian",

          "nog": "Nogai",

          "non": "Old Norse",

          "nov": "Novial",

          "nqo": "NʼKo",

          "nr": "South Ndebele",

          "nso": "Northern Sotho",

          "nus": "Nuer",

          "nv": "Navajo",

          "nwc": "Classical Newari",

          "ny": "Nyanja",

          "nym": "Nyamwezi",

          "nyn": "Nyankole",

          "nyo": "Nyoro",

          "nzi": "Nzima",

          "oc": "Occitan",

          "oj": "Ojibwa",

          "om": "Oromo",

          "or": "Oriya",

          "os": "Ossetic",

          "osa": "Osage",

          "ota": "Ottoman Turkish",

          "pa": "Punjabi",

          "pag": "Pangasinan",

          "pal": "Pahlavi",

          "pam": "Pampanga",

          "pap": "Papiamento",

          "pau": "Palauan",

          "pcd": "Picard",

          "pdc": "Pennsylvania German",

          "pdt": "Plautdietsch",

          "peo": "Old Persian",

          "pfl": "Palatine German",

          "phn": "Phoenician",

          "pi": "Pali",

          "pl": "Polish",

          "pms": "Piedmontese",

          "pnt": "Pontic",

          "pon": "Pohnpeian",

          "prg": "Prussian",

          "pro": "Old Provençal",

          "ps": "Pashto",

          "ps-alt-variant": "Pushto",

          "pt": "Portuguese",

          "pt-BR": "Brazilian Portuguese",

          "pt-PT": "European Portuguese",

          "qu": "Quechua",

          "quc": "Kʼicheʼ",

          "qug": "Chimborazo Highland Quichua",

          "raj": "Rajasthani",

          "rap": "Rapanui",

          "rar": "Rarotongan",

          "rgn": "Romagnol",

          "rif": "Riffian",

          "rm": "Romansh",

          "rn": "Rundi",

          "ro": "Romanian",

          "ro-MD": "Moldavian",

          "rof": "Rombo",

          "rom": "Romany",

          "root": "Root",

          "rtm": "Rotuman",

          "ru": "Russian",

          "rue": "Rusyn",

          "rug": "Roviana",

          "rup": "Aromanian",

          "rw": "Kinyarwanda",

          "rwk": "Rwa",

          "sa": "Sanskrit",

          "sad": "Sandawe",

          "sah": "Sakha",

          "sam": "Samaritan Aramaic",

          "saq": "Samburu",

          "sas": "Sasak",

          "sat": "Santali",

          "saz": "Saurashtra",

          "sba": "Ngambay",

          "sbp": "Sangu",

          "sc": "Sardinian",

          "scn": "Sicilian",

          "sco": "Scots",

          "sd": "Sindhi",

          "sdc": "Sassarese Sardinian",

          "se": "Northern Sami",

          "see": "Seneca",

          "seh": "Sena",

          "sei": "Seri",

          "sel": "Selkup",

          "ses": "Koyraboro Senni",

          "sg": "Sango",

          "sga": "Old Irish",

          "sgs": "Samogitian",

          "sh": "Serbo-Croatian",

          "shi": "Tachelhit",

          "shn": "Shan",

          "shu": "Chadian Arabic",

          "si": "Sinhala",

          "sid": "Sidamo",

          "sk": "Slovak",

          "sl": "Slovenian",

          "sli": "Lower Silesian",

          "sly": "Selayar",

          "sm": "Samoan",

          "sma": "Southern Sami",

          "smj": "Lule Sami",

          "smn": "Inari Sami",

          "sms": "Skolt Sami",

          "sn": "Shona",

          "snk": "Soninke",

          "so": "Somali",

          "sog": "Sogdien",

          "sq": "Albanian",

          "sr": "Serbian",

          "srn": "Sranan Tongo",

          "srr": "Serer",

          "ss": "Swati",

          "ssy": "Saho",

          "st": "Southern Sotho",

          "stq": "Saterland Frisian",

          "su": "Sundanese",

          "suk": "Sukuma",

          "sus": "Susu",

          "sux": "Sumerian",

          "sv": "Swedish",

          "sw": "Swahili",

          "swb": "Comorian",

          "swc": "Congo Swahili",

          "syc": "Classical Syriac",

          "syr": "Syriac",

          "szl": "Silesian",

          "ta": "Tamil",

          "tcy": "Tulu",

          "te": "Telugu",

          "tem": "Timne",

          "teo": "Teso",

          "ter": "Tereno",

          "tet": "Tetum",

          "tg": "Tajik",

          "th": "Thai",

          "ti": "Tigrinya",

          "tig": "Tigre",

          "tiv": "Tiv",

          "tk": "Turkmen",

          "tkl": "Tokelau",

          "tkr": "Tsakhur",

          "tl": "Tagalog",

          "tlh": "Klingon",

          "tli": "Tlingit",

          "tly": "Talysh",

          "tmh": "Tamashek",

          "tn": "Tswana",

          "to": "Tongan",

          "tog": "Nyasa Tonga",

          "tpi": "Tok Pisin",

          "tr": "Turkish",

          "tru": "Turoyo",

          "trv": "Taroko",

          "ts": "Tsonga",

          "tsd": "Tsakonian",

          "tsi": "Tsimshian",

          "tt": "Tatar",

          "ttt": "Muslim Tat",

          "tum": "Tumbuka",

          "tvl": "Tuvalu",

          "tw": "Twi",

          "twq": "Tasawaq",

          "ty": "Tahitian",

          "tyv": "Tuvinian",

          "tzm": "Central Atlas Tamazight",

          "udm": "Udmurt",

          "ug": "Uyghur",

          "ug-alt-variant": "Uighur",

          "uga": "Ugaritic",

          "uk": "Ukrainian",

          "umb": "Umbundu",

          "und": "Unknown Language",

          "ur": "Urdu",

          "uz": "Uzbek",

          "vai": "Vai",

          "ve": "Venda",

          "vec": "Venetian",

          "vep": "Veps",

          "vi": "Vietnamese",

          "vls": "West Flemish",

          "vmf": "Main-Franconian",

          "vo": "Volapük",

          "vot": "Votic",

          "vro": "Võro",

          "vun": "Vunjo",

          "wa": "Walloon",

          "wae": "Walser",

          "wal": "Wolaytta",

          "war": "Waray",

          "was": "Washo",

          "wo": "Wolof",

          "wuu": "Wu Chinese",

          "xal": "Kalmyk",

          "xh": "Xhosa",

          "xmf": "Mingrelian",

          "xog": "Soga",

          "yao": "Yao",

          "yap": "Yapese",

          "yav": "Yangben",

          "ybb": "Yemba",

          "yi": "Yiddish",

          "yo": "Yoruba",

          "yrl": "Nheengatu",

          "yue": "Cantonese",

          "za": "Zhuang",

          "zap": "Zapotec",

          "zbl": "Blissymbols",

          "zea": "Zeelandic",

          "zen": "Zenaga",

          "zgh": "Standard Moroccan Tamazight",

          "zh": "Chinese",

          "zh-Hans": "Simplified Chinese",

          "zh-Hant": "Traditional Chinese",

          "zu": "Zulu",

          "zun": "Zuni",

          "zxx": "No linguistic content",

          "zza": "Zaza"

        }

      }

    }

  }

}

-------------

Q
"Why did you choose to use the identifier for Taiwan in the latter part of the URI segment for Traditional Chinese?"

A
http://en.wikipedia.org/wiki/Traditional_Chinese_characters - "They are most commonly the characters in the standardized character sets of Taiwan, of Hong Kong and Macau or in the Kangxi Dictionary."

-----------------------

Q
"Using the U.K. flag for U.S. English is one of my favorite things to see in someone's code when they're commenting on the handling of languages by English-speaking people."

A
Using flags for language identification is not recommended at all, but I do it because of tradition; clients might push me this way. Consider the "flag" field a customization: it is not meant to be implemented within the framework, and the set of flag images is not meant to be shipped with the framework either. Also, in North America you can use the American flag, and in Europe you can use the English flag, if flags are to be shown.

---------------------------

"These examples point out some of the issues I see with trying to integrate this into the core. Ideally, the Lang class itself would have better handling of standard language tags (http://www.w3.org/International/articles/language-tags/). If they could be handled properly by the Lang class, it would be much easier to modify the routing to properly handle language tags."

Re:
CLDR codes are used by the PHP intl extension, which is why I prefer them. I think there are no significant differences from http://www.w3.org/International/articles/language-tags/
(dashes instead of underscores are used there too).

----------------------------------

"At that point, the system should be able to handle language tags in any number of formats, preferably with configurable choices between '-', '_', and '/' for the separators used between language tags and subtags as well as configuration of default language choices for the general tags as well as the lack of language tags, since all of these impact the ability to use those language tags with varying standards and extensions (or for use in a URL routing scheme)."

Re:
I agree with this, but only to some degree; in this area it is easy to end up with something quite complicated. Implementing routing by a language segment is actually the easy part.

Dashes or underscores - this is solvable, and it is not very important right now.

The tricky part is how external components (PHP or JavaScript) identify languages; they do this in different ways:
PHPMailer - https://github.com/PHPMailer/PHPMailer/tree/master/language
CKEditor - https://github.com/ckeditor/ckeditor-dev/tree/master/lang
Datatables - https://github.com/DataTables/Plugins/tree/master/i18n
etc.

Translation between language identifiers is inevitable. This is how I approach it; here is an example:

Code:
'portuguese' => array(

        'code' => 'pt',

        'direction' => 'ltr',

        'uri_segment' => 'pt',

        'name' => 'Português',

        'name_en' => 'Portuguese',

        'flag' => 'PT',

    ),

    'portuguese-brazilian' => array(

        'code' => 'pt-BR',

        'direction' => 'ltr',

        'uri_segment' => 'pt-br',

        'name' => 'Português do Brasil',

        'name_en' => 'Brazilian Portuguese',

        'flag' => 'BR',

        'phpmailer' => 'br',

    ),

For Brazilian Portuguese, PHPMailer uses a non-standard language identifier, so I extend the configuration for this language with an additional field, "phpmailer", that holds this non-standard ID. Of course, the default framework configuration will not contain such data, but it could still support this logic.

Poll: URI language identifier in core? You do not have permission to vote in this poll.
yes	52.38%	22	52.38%
no	26.19%	11	26.19%
maybe	21.43%	9	21.43%
Total		42 vote(s)	100%