# -*- coding: utf_8 -*- # # Include this dependency: https://www.mediawiki.org/wiki/Manual:Pywikibot/Installation#Download_Pywikibot_with_SVN # import sys sys.path.append('./core') import pywikibot import traceback import os import re from time import strftime #language information lang_info ={'en': {'name':u'English', 'localname':u'English', 'weight': 1.0}, 'de': {'name':u'German', 'localname':u'Deutsch', 'weight':1.0}, 'fr': {'name':u'French', 'localname':u'Français', 'weight':1.0}, 'pl': {'name':u'Polish', 'localname':u'Polski', 'weight':1.1}, 'ja': {'name':u'Japanese', 'localname':u'日本語', 'weight':1.9}, 'it': {'name':u'Italian', 'localname':u'Italiano', 'weight':1.1}, 'nl': {'name':u'Dutch', 'localname':u'Nederlands', 'weight':0.9}, 'pt': {'name':u'Portuguese', 'localname':u'Português', 'weight':1.1}, 'es': {'name':u'Spanish', 'localname':u'Español', 'weight':1.1}, 'sv': {'name':u'Swedish', 'localname':u'Svenska', 'weight':1.1}, 'ru': {'name':u'Russian', 'localname':u'Русский', 'weight':1.4}, 'zh': {'name':u'Chinese', 'localname':u'中文', 'weight':3.7}, 'no': {'name':u'Norwegian (Bokmål)','localname':u'Norsk (Bokmål)', 'weight':1.2}, 'fi': {'name':u'Finnish', 'localname':u'Suomi', 'weight':1.1}, 'vo': {'name':u'Volapük', 'localname':u'Volapük'}, 'ca': {'name':u'Catalan', 'localname':u'Català', 'weight':1.1}, 'ro': {'name':u'Romanian', 'localname':u'Română', 'weight':1.1}, 'tr': {'name':u'Turkish', 'localname':u'Türkçe', 'weight':1.3}, 'uk': {'name':u'Ukrainian', 'localname':u'Українська', 'weight':1.3}, 'eo': {'name':u'Esperanto', 'localname':u'Esperanto', 'weight':1.1}, 'cs': {'name':u'Czech', 'localname':u'Čeština', 'weight':1.3}, 'hu': {'name':u'Hungarian', 'localname':u'Magyar', 'weight':1.1}, 'sk': {'name':u'Slovak', 'localname':u'Slovenčina', 'weight':1.3}, 'da': {'name':u'Danish', 'localname':u'Dansk', 'weight':1.2}, 'id': {'name':u'Indonesian', 'localname':u'Bahasa Indonesia', 'weight':0.9}, 'he': {'name':u'Hebrew', 'localname':u'עברית', 'weight':1.2}, 'lt': {'name':u'Lithuanian', 'localname':u'Lietuvių', 'weight':1.2}, 'sr': {'name':u'Serbian', 'localname':u'Српски / Srpski', 'weight':1.4}, 'sl': {'name':u'Slovenian', 'localname':u'Slovenščina', 'weight':1.2}, 'ko': {'name':u'Korean', 'localname':u'한국어', 'weight':2.5}, 'ar': {'name':u'Arabic', 'localname':u'العربية', 'weight':1.0}, 'bg': {'name':u'Bulgarian', 'localname':u'Български', 'weight':1.1}, 'et': {'name':u'Estonian', 'localname':u'Eesti', 'weight':1.2}, 'hr': {'name':u'Croatian', 'localname':u'Hrvatski', 'weight':1.3}, 'new':{'name':u'Newar / Nepal Bhasa','localname':u'नेपाल भाषा'}, 'te': {'name':u'Telugu', 'localname':u'తెలుగు'}, 'vi': {'name':u'Vietnamese', 'localname':u'Tiếng Việt', 'weight':1.1}, 'th': {'name':u'Thai', 'localname':u'ไทย', 'weight':1.0}, 'gl': {'name':u'Galician', 'localname':u'Galego', 'weight':1.2}, 'fa': {'name':u'Persian', 'localname':u'فارسی', 'weight':1.2}, 'nn': {'name':u'Norwegian (Nynorsk)','localname':u'Nynorsk', 'similar_lang':'no'}, 'ceb':{'name':u'Cebuano', 'localname':u'Sinugboanong Binisaya', 'weight':0.8}, 'el': {'name':u'Greek', 'localname':u'Ελληνικά', 'weight':1.1}, 'ms': {'name':u'Malay', 'localname':u'Bahasa Melayu', 'weight':1.0}, 'simple':{'name':u'Simple English','localname':u'Simple English'}, 'eu': {'name':u'Basque', 'localname':u'Euskara', 'weight':1.1}, 'bpy':{'name':u'Bishnupriya Manipuri','localname':u'ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী'}, 'bs': {'name':u'Bosnian', 'localname':u'Bosanski', 'similar_lang':'hr'}, 'lb': {'name':u'Luxembourgish','localname':u'Lëtzebuergesch'}, 'is': {'name':u'Icelandic', 'localname':u'Íslenska', 'weight':1.1}, 'ka': {'name':u'Georgian', 'localname':u'ქართული'}, 'sq': {'name':u'Albanian', 'localname':u'Shqip'}, 'la': {'name':u'Latin', 'localname':u'Latina', 'weight':1.1}, 'br': {'name':u'Breton', 'localname':u'Brezhoneg'}, 'az': {'name':u'Azeri', 'localname':u'Azərbaycan', 'weight':1.2}, 'hi': {'name':u'Hindi', 'localname':u'हिन्दी', 'weight':1.0}, 'bn': {'name':u'Bengali', 'localname':u'বাংলা'}, 'ht': {'name':u'Haitian', 'localname':u'Krèyol ayisyen'}, 'mk': {'name':u'Macedonian', 'localname':u'Македонски', 'weight':1.3}, 'mr': {'name':u'Marathi', 'localname':u'मराठी'}, 'sh': {'name':u'Serbo-Croatian','localname':u'Srpskohrvatski / Српскохрватски', 'similar_lang':'hr'}, 'tl': {'name':u'Tagalog', 'localname':u'Tagalog'}, 'io': {'name':u'Ido', 'localname':u'Ido'}, 'cy': {'name':u'Welsh', 'localname':u'Cymraeg', 'weight':1.2}, 'pms':{'name':u'Piedmontese', 'localname':u'Piemontèis'}, 'lv': {'name':u'Latvian', 'localname':u'Latviešu', 'weight':1.1}, 'su': {'name':u'Sundanese', 'localname':u'Basa Sunda'}, 'ta': {'name':u'Tamil', 'localname':u'தமிழ்', 'weight':0.9}, 'jv': {'name':u'Javanese', 'localname':u'Basa Jawa'}, 'nap':{'name':u'Neapolitan', 'localname':u'Nnapulitano'}, 'oc': {'name':u'Occitan', 'localname':u'Occitan'}, 'nds':{'name':u'Low Saxon', 'localname':u'Plattdüütsch'}, 'scn':{'name':u'Sicilian', 'localname':u'Sicilianu'}, 'ast':{'name':u'Asturian', 'localname':u'Asturianu'}, 'ku': {'name':u'Kurdish', 'localname':u'Kurdî / كوردی'}, 'be': {'name':u'Belarusian', 'localname':u'Беларуская', 'similar_lang':'be-x-old'}, 'wa': {'name':u'Walloon', 'localname':u'Walon'}, 'af': {'name':u'Afrikaans', 'localname':u'Afrikaans', 'weight':1.0}, 'be-x-old':{'name':u'Belarusian (Taraškievica)','localname':u'Беларуская (тарашкевіца)', 'weight':1.4}, 'tg': {'name':u'Tajik', 'localname':u'Тоҷикӣ'}, 'an': {'name':u'Aragonese', 'localname':u'Aragonés', 'weight':1.1}, 'fy': {'name':u'West Frisian','localname':u'Frysk'}, 'vec':{'name':u'Venetian', 'localname':u'Vèneto'}, 'roa-tara':{'name':u'Tarantino', 'localname':u'Tarandíne'}, 'cv': {'name':u'Chuvash', 'localname':u'Чăваш'}, 'zh-yue':{'name':u'Cantonese', 'localname':u'粵語', 'similar_lang':'zh'}, 'ur': {'name':u'Urdu', 'localname':u'اردو'}, 'ksh':{'name':u'Ripuarian', 'localname':u'Ripoarisch'}, 'sw': {'name':u'Swahili', 'localname':u'Kiswahili'}, 'qu': {'name':u'Quechua', 'localname':u'Runa Simi'}, 'uz': {'name':u'Uzbek', 'localname':u'O‘zbek'}, 'mi': {'name':u'Maori', 'localname':u'Māori'}, 'ga': {'name':u'Irish', 'localname':u'Gaeilge'}, 'bat-smg':{'name':u'Samogitian', 'localname':u'Žemaitėška'}, 'ml': {'name':u'Malayalam', 'localname':u'മലയാളം', 'weight':1.1}, 'gd': {'name':u'Scottish Gaelic','localname':u'Gàidhlig'}, 'yo': {'name':u'Yoruba', 'localname':u'Yorùbá'}, 'co': {'name':u'Corsican', 'localname':u'Corsu'}, 'kn': {'name':u'Kannada', 'localname':u'ಕನ್ನಡ', 'weight':0.9}, 'pam':{'name':u'Kapampangan', 'localname':u'Kapampangan'}, 'yi': {'name':u'Yiddish', 'localname':u'ייִדיש'}, 'hsb':{'name':u'Upper Sorbian','localname':u'Hornjoserbsce'}, 'nah':{'name':u'Nahuatl', 'localname':u'Nāhuatl'}, 'ia': {'name':u'Interlingua', 'localname':u'Interlingua', 'weight':1.0}, 'li': {'name':u'Limburgian', 'localname':u'Limburgs'}, 'sa': {'name':u'Sanskrit', 'localname':u'संस्कृतम्'}, 'hy': {'name':u'Armenian', 'localname':u'Հայերեն', 'weight':1.2}, 'tt': {'name':u'Tatar', 'localname':u'Tatarça / Татарча'}, 'als':{'name':u'Alemannic', 'localname':u'Alemannisch', 'weight':1.1}, 'roa-rup':{'name':u'Aromanian', 'localname':u'Armãneashce'}, 'lmo':{'name':u'Lombard', 'localname':u'Lumbaart'}, 'map-bms':{'name':u'Banyumasan', 'localname':u'Basa Banyumasan'}, 'am': {'name':u'Amharic', 'localname':u'አማርኛ'}, 'nrm':{'name':u'Norman', 'localname':u'Nouormand/Normaund'}, 'zh-min-nan':{'name':u'Min Nan', 'localname':u'Bân-lâm-gú', 'weight':1.2}, 'pag':{'name':u'Pangasinan', 'localname':u'Pangasinan'}, 'wuu':{'name':u'Wu', 'localname':u'吴语', 'similar_lang':'zh'}, 'fo': {'name':u'Faroese', 'localname':u'Føroyskt'}, 'vls':{'name':u'West Flemish','localname':u'West-Vlams'}, 'nds-nl':{'name':u'Dutch Low Saxon','localname':u'Nedersaksisch'}, 'se': {'name':u'Northern Sami','localname':u'Sámegiella'}, 'rm': {'name':u'Romansh', 'localname':u'Rumantsch'}, 'ne': {'name':u'Nepali', 'localname':u'नेपाली'}, 'war':{'name':u'Waray-Waray', 'localname':u'Winaray'}, 'fur':{'name':u'Friulian', 'localname':u'Furlan'}, 'lij':{'name':u'Ligurian', 'localname':u'Líguru'}, 'nov':{'name':u'Novial', 'localname':u'Novial'}, 'bh': {'name':u'Bihari', 'localname':u'भोजपुरी'}, 'sco':{'name':u'Scots', 'localname':u'Scots'}, 'dv': {'name':u'Divehi', 'localname':u'ދިވެހިބަސް'}, 'pi': {'name':u'Pali', 'localname':u'पाऴि'}, 'diq':{'name':u'Zazaki', 'localname':u'Zazaki'}, 'ilo':{'name':u'Ilokano', 'localname':u'Ilokano'}, 'kk': {'name':u'Kazakh', 'localname':u'Қазақша', 'weight':1.3}, 'os': {'name':u'Ossetian', 'localname':u'Иронау'}, 'frp':{'name':u'Franco-Provençal/Arpitan','localname':u'Arpitan'}, 'zh-classical':{'name':u'Classical Chinese','localname':u'古文 / 文言文', 'similar_lang':'zh'}, 'mt': {'name':u'Maltese', 'localname':u'Malti'}, 'lad':{'name':u'Ladino', 'localname':u'Dzhudezmo'}, 'fiu-vro':{'name':u'Võro', 'localname':u'Võro'}, 'pdc':{'name':u'Pennsylvania German','localname':u'Deitsch'}, 'csb':{'name':u'Kashubian', 'localname':u'Kaszëbsczi'}, 'kw': {'name':u'Cornish', 'localname':u'Kernewek'}, 'bar':{'name':u'Bavarian', 'localname':u'Boarisch'}, 'to': {'name':u'Tongan', 'localname':u'faka Tonga'}, 'haw':{'name':u'Hawaiian', 'localname':u'Hawai`i'}, 'mn': {'name':u'Mongolian', 'localname':u'Монгол'}, 'ps': {'name':u'Pashto', 'localname':u'پښتو'}, 'ang':{'name':u'Anglo-Saxon', 'localname':u'Englisc'}, 'km': {'name':u'Khmer', 'localname':u'ភាសាខ្មែរ'}, 'gv': {'name':u'Manx', 'localname':u'Gaelg'}, 'tk': {'name':u'Turkmen', 'localname':u'تركمن / Туркмен'}, 'ln': {'name':u'Lingala', 'localname':u'Lingala'}, 'ie': {'name':u'Interlingue', 'localname':u'Interlingue'}, 'tpi':{'name':u'Tok Pisin', 'localname':u'Tok Pisin'}, 'crh':{'name':u'Crimean Tatar','localname':u'Qırımtatarca'}, 'jbo':{'name':u'Lojban', 'localname':u'Lojban', 'weight':1.2}, 'wo': {'name':u'Wolof', 'localname':u'Wolof'}, 'ay': {'name':u'Aymara', 'localname':u'Aymar'}, 'zea':{'name':u'Zealandic', 'localname':u'Zeêuws'}, 'eml':{'name':u'Emilian-Romagnol','localname':u'Emiliàn e rumagnòl'}, 'si': {'name':u'Sinhalese', 'localname':u'සිංහල'}, 'sc': {'name':u'Sardinian', 'localname':u'Sardu'}, 'or': {'name':u'Oriya', 'localname':u'ଓଡ଼ିଆ'}, 'ig': {'name':u'Igbo', 'localname':u'Igbo'}, 'mg': {'name':u'Malagasy', 'localname':u'Malagasy'}, 'cbk-zam':{'name':u'Zamboanga Chavacano','localname':u'Chavacano de Zamboanga'}, 'gu': {'name':u'Gujarati', 'localname':u'ગુજરાતી'}, 'ky': {'name':u'Kirghiz', 'localname':u'Кыргызча'}, 'kg': {'name':u'Kongo', 'localname':u'KiKongo'}, 'ty': {'name':u'Tahitian', 'localname':u'Reo Mā`ohi'}, 'glk':{'name':u'Gilaki', 'localname':u'گیلکی'}, 'arc':{'name':u'Assyrian Neo-Aramaic','localname':u'ܐܪܡܝܐ'}, 'mo': {'name':u'Moldovan', 'localname':u'Молдовеняскэ'}, 'gn': {'name':u'Guarani', 'localname':u'Avañe\'ẽ'}, 'kab':{'name':u'Kabyle', 'localname':u'Taqbaylit'}, 'so': {'name':u'Somali', 'localname':u'Soomaaliga'}, 'ks': {'name':u'Kashmiri', 'localname':u'कश्मीरी / كشميري'}, 'stq':{'name':u'Saterland Frisian','localname':u'Seeltersk'}, 'mzn':{'name':u'Mazandarani', 'localname':u'مَزِروني'}, 'cu': {'name':u'Old Church Slavonic','localname':u'Словѣньскъ'}, 'ce': {'name':u'Chechen', 'localname':u'Нохчийн'}, 'udm':{'name':u'Udmurt', 'localname':u'Удмурт кыл'}, 'tet':{'name':u'Tetum', 'localname':u'Tetun'}, 'sd': {'name':u'Sindhi', 'localname':u'سنڌي، سندھی ، सिन्ध'}, 'pap':{'name':u'Papiamentu', 'localname':u'Papiamentu'}, 'ba': {'name':u'Bashkir', 'localname':u'Башҡорт'}, 'pa': {'name':u'Punjabi', 'localname':u'ਪੰਜਾਬੀ'}, 'rmy':{'name':u'Romani', 'localname':u'romani - रोमानी'}, 'lo': {'name':u'Lao', 'localname':u'ລາວ'}, 'na': {'name':u'Nauruan', 'localname':u'dorerin Naoero'}, 'bcl':{'name':u'Central Bicolano','localname':u'Bikol'}, 'kaa':{'name':u'Karakalpak', 'localname':u'Qaraqalpaq tili'}, 'gan':{'name':u'Gan', 'localname':u'贛語', 'similar_lang':'zh'}, 'iu': {'name':u'Inuktitut', 'localname':u'ᐃᓄᒃᑎᑐᑦ'}, 'myv':{'name':u'Erzya', 'localname':u'Эрзянь (Erzjanj Kelj)'}, 'szl':{'name':u'Silesian', 'localname':u'Ślůnski'}, 'sah':{'name':u'Sakha', 'localname':u'Саха тыла (Saxa Tyla)'}, 'my': {'name':u'Burmese', 'localname':u'Burmese'}, 'ext':{'name':u'Extremaduran','localname':u'Estremeñu'}, 'hif':{'name':u'Fiji Hindi', 'localname':u'Fiji Hindi'}, 'bo': {'name':u'Tibetan', 'localname':u'བོད་སྐད་'}, 'srn':{'name':u'Sranan', 'localname':u'Sranantongo'}, 'got':{'name':u'Gothic', 'localname':u'ðミフᄇðミフ﾿ðミヘトðミフᄚðミヘツðミフᄚðミフᄊðミフᄈðミフᄚ'}, 'dsb':{'name':u'Lower Sorbian','localname':u'Dolnoserbšćina'}, 'bm': {'name':u'Bambara', 'localname':u'Bamanankan'}, 'sm': {'name':u'Samoan', 'localname':u'Gagana Samoa'}, 'cdo':{'name':u'Min Dong', 'localname':u'Mìng-dĕ̤ng-ngṳ̄'}, 'chr':{'name':u'Cherokee', 'localname':u'ᏣᎳᎩ ᎧᏬᏂᎯᏍᏗ'}, 'mdf':{'name':u'Moksha', 'localname':u'Мокшень (Mokshanj Kälj)'}, 'om': {'name':u'Oromo', 'localname':u'Oromoo'}, 'ee': {'name':u'Ewe', 'localname':u'Eʋegbe'}, 'as': {'name':u'Assamese', 'localname':u'অসমীয়া ভাষা আৰু লিপি'}, 'ti': {'name':u'Tigrinya', 'localname':u'ትግርኛ_ፊደል'}, 'ug': {'name':u'Uyghur', 'localname':u'Oyghurque'}, 'kv': {'name':u'Komi', 'localname':u'Коми'}, 'zu': {'name':u'Zulu', 'localname':u'IsiZulu'}, 'av': {'name':u'Avar', 'localname':u'Авар'}, 'nv': {'name':u'Navajo', 'localname':u'Diné bizaad'}, 'ss': {'name':u'Swati', 'localname':u'SiSwati'}, 'pih':{'name':u'Norfolk', 'localname':u'Norfuk'}, 'ts': {'name':u'Tsonga', 'localname':u'Xitsonga'}, 'cr': {'name':u'Cree', 'localname':u'Nehiyaw'}, 've': {'name':u'Venda', 'localname':u'TshiVenda'}, 'ch': {'name':u'Chamorro', 'localname':u'Chamoru'}, 'bi': {'name':u'Bislama', 'localname':u'Bislama'}, 'xh': {'name':u'Xhosa', 'localname':u'IsiXhosa'}, 'rw': {'name':u'Kinyarwanda', 'localname':u'Kinyarwanda'}, 'dz': {'name':u'Dzongkha', 'localname':u'རྫོང་ཁ་'}, 'tn': {'name':u'Tswana', 'localname':u'Setswana'}, 'kl': {'name':u'Greenlandic', 'localname':u'Kalaallisut'}, 'bug':{'name':u'Buginese', 'localname':u'Basa Ugi'}, 'ik': {'name':u'Inupiak', 'localname':u'Iñupiak uqautchit'}, 'bxr':{'name':u'Buryat (Russia)','localname':u'Буряад'}, 'st': {'name':u'Sesotho', 'localname':u'Sesotho'}, 'xal':{'name':u'Kalmyk', 'localname':u'Хальмг келн'}, 'ny': {'name':u'Chichewa', 'localname':u'Chicheŵa'}, 'ak': {'name':u'Akan', 'localname':u'Akana'}, 'ab': {'name':u'Abkhazian', 'localname':u'Аҧсуа бызшәа'}, 'fj': {'name':u'Fijian', 'localname':u'Na Vosa Vakaviti'}, 'lg': {'name':u'Luganda', 'localname':u'Luganda'}, 'tw': {'name':u'Twi', 'localname':u'Twi'}, 'ha': {'name':u'Hausa', 'localname':u'هَوُسَ'}, 'za': {'name':u'Zhuang', 'localname':u'Sawcuengh'}, 'ff': {'name':u'Fula', 'localname':u'Fulfulde'}, 'lbe':{'name':u'Lak', 'localname':u'Лакку маз'}, 'ki': {'name':u'Kikuyu', 'localname':u'Gĩgĩkũyũ'}, 'sn': {'name':u'Shona', 'localname':u'ChiShona'}, 'tum':{'name':u'Tumbuka', 'localname':u'ChiTumbuka'}, 'sg': {'name':u'Sango', 'localname':u'Sängö'}, 'ii': {'name':u'Sichuan Yi', 'localname':u'ꆇꉙ'}, 'chy':{'name':u'Cheyenne', 'localname':u'Tsetsêhestâhese'}, 'rn': {'name':u'Kirundi', 'localname':u'Kirundi'}, 'cho':{'name':u'Choctaw', 'localname':u'Chahta Anumpa'}, 'mh': {'name':u'Marshallese', 'localname':u'Kajin M̧ajeļ'}, 'aa': {'name':u'Afar', 'localname':u'Afar'}, 'ng': {'name':u'Ndonga', 'localname':u'Oshiwambo'}, 'kj': {'name':u'Kuanyama', 'localname':u'Kuanyama'}, 'ho': {'name':u'Hiri Motu', 'localname':u'Hiri Motu'}, 'mus':{'name':u'Muscogee', 'localname':u'Muskogee'}, 'kr': {'name':u'Kanuri', 'localname':u' Kanuri'}, 'hz': {'name':u'Herero', 'localname':u'Otsiherero'}, # 'tokipona':{'name':u'Tokipona', 'localname':u'Tokipona'}, 'arz':{'name':u'Egyptian Arabic', 'localname':u'مصرى (Maṣrī)', 'similar_lang':'ar'}, 'pnt':{'name':u'Pontic', 'localname':u'Ποντιακά', 'similar_lang':'el'}, 'mhr':{'name':u'Meadow Mari', 'localname':u'Олык Марий'}, 'ace':{'name':u'Acehnese', 'localname':u'Acèh'}, 'ckb':{'name':u'Soranî', 'localname':u'Soranî / کوردی'}, 'mwl':{'name':u'Mirandese', 'localname':u'Mirandés'}, 'pnb':{'name':u'Western Panjabi', 'localname':u'پنجابی'}, 'pcd':{'name':u'Picard', 'localname':u'Picard'}, 'krc':{'name':u'Karachay-Balkar', 'localname':u'Къарачай-Малкъар'}, 'frr':{'name':u'North Frisian', 'localname':u'Nordfriisk'}, 'bjn':{'name':u'Banjar', 'localname':u'Bahasa Banjar'}, 'mrj':{'name':u'Hill Mari', 'localname':u'Кырык Мары (Kyryk Mary)'}, 'koi':{'name':u'Komi-Permyak', 'localname':u'Перем Коми (Perem Komi)'}, 'gag':{'name':u'Gagauz', 'localname':u'Gagauz'}, 'pfl':{'name':u'Palatinate German','localname':u'Pfälzisch'}, 'rue':{'name':u'Rusyn', 'localname':u'русиньскый язык'}, 'ltg':{'name':u'Latgalian', 'localname':u'Latgaļu volūda'}, 'kbd':{'name':u'Kabardian', 'localname':u'Aдыгэбзэ'}, 'xmf':{'name':u'Mingrelian', 'localname':u'მარგალური'}, 'nso':{'name':u'Northern Sotho', 'localname':u'Sesotho sa Leboa'}, 'vep':{'name':u'Veps', 'localname':u'Vepsän kel\''}, 'lez':{'name':u'Lezgi', 'localname':u'Лезги'}, 'min':{'name':u'Minangkabau', 'localname':u'Minangkabau'}, 'tyv':{'name':u'Tuva', 'localname':u'Тыва дыл'}, 'hak':{'name':u'Hakka', 'localname':u'Hak-kâ-fa / 客家話'}, 'mai':{'name':u'Maithili', 'localname':u'मैथिली'}} #languages to process lang_keys = ['en','tyv'] lang_keys = lang_info.keys() lang_keys.sort() #optimize by caching iw-links iw_cache = {} en_labels = {} #debug max_words = -1 prev_score = {} #score colors color10000 = 'BF5FFF' color4000 = 'FF7F00' color2000 = 'FFBE00' color1000 = 'FFFF00' color500 = 'BEFF00' color250 = '40FF00' color100 = '00FF7D' color0 = 'EFEFEF' #format with spaces def FormatNumber(s): r = [] for i, c in enumerate(reversed(str(int(s)))): if i and i % 3 == 0: r.insert(0, ',') r.insert(0, c) return ''.join(r) def GetPreviousScores(): temp_path = "PreviousScores.txt" if os.path.isfile(temp_path): temp_file = open(temp_path) for line in temp_file: tokens = line.split() prev_score[tokens[0]] = float(tokens[1]) temp_file.close() def GetArticle(item, wiki, lang): word = GetArticleInterwikiName(item, lang) if len(word) > 0: page = pywikibot.Page(wiki, word) article = page.get(get_redirect=True) if u'#REDIRECT' in article.upper(): text_start = article.find('[[') text_end = article.find(']]', text_start) word = article[text_start+2:text_end] page = pywikibot.Page(wiki, word) article = page.get() else: article = '' return article def GetArticleInterwikiName(item, lang): if item in iw_cache: iw_links = iw_cache[item] else: wikidata = pywikibot.Site('en', 'wikipedia').data_repository() datapage = pywikibot.ItemPage(wikidata, item) datapage.get() iw_links = datapage.sitelinks labels = datapage.labels iw_cache[item] = iw_links if u'en' in labels: en_labels[item] = labels[u'en'] lang_wiki = lang.replace("-","_") + u'wiki' if lang_wiki in iw_links: return iw_links[lang_wiki] else: return '' def GetInterwikiLength(article): #calculate len of all interwiki links interwiki_len = 0 interwiki_last = 0 interwiki_colon = 0 interwiki_nl = 0 interwiki_first = article.find(u'[[', interwiki_last) while interwiki_first > -1: interwiki_last = article.find(u']]', interwiki_first) interwiki_colon = article.find(u':', interwiki_first) if interwiki_colon > -1 and interwiki_colon < interwiki_last: curlang = article[interwiki_first+2:interwiki_colon] if curlang in lang_info: interwiki_nl = article.find(u'\n', interwiki_last) if interwiki_nl > -1: interwiki_len += (interwiki_nl - interwiki_first) + 1 else: interwiki_len += (interwiki_last - interwiki_first) + 2 interwiki_first = article.find(u'[[', interwiki_last) return interwiki_len def GetCommentLength(article): #calculate len of all comments comment_len = 0 comment_last = 0 comment_first = article.find(u'', comment_first) if comment_last == -1: comment_last = comment_first + 4 comment_len += (comment_last - comment_first) - 4 comment_first = article.find(u'') article = comments.sub("", article) #remove references refs = re.compile(r'') article = refs.sub("", article) # convert article to lower case word list word_list = article.lower().split() if len(word_list) == 0: return False # create dictionary of word:frequency pairs freq_dic = {} # punctuation marks to be removed punctuation = re.compile(r'[.?!,":;]') for word in word_list: word = punctuation.sub("", word) if word in freq_dic: freq_dic[word] += 1 else: freq_dic[word] = 1 # usually English is ~30% these words and non-English at most a few percent common_english_words = ['the','of','on','a','is','in','his','have','by','but','that','to','with','for', 'from''are','was','he','which','be','as','it','this','first', 'new', 'and', 'she','also','after','at','become','best','from','had','great', 'into','their', 'these','they','time','who','her','not','one','or', 'made', 'would','are','between'] en_word_count = 0 for word in common_english_words: if word in freq_dic: en_word_count += freq_dic[word] percent_thats_common_english = 100.0 * en_word_count / len(word_list) # flag if 20% or more in the list which means more than half the article is English if percent_thats_common_english > 20 and en_word_count > 20: print "Percent %f, %d out of %d" % (percent_thats_common_english, en_word_count, len(word_list)) return True return False def GetArticleType(wt_article_size): if wt_article_size == 0: return 'absent' elif 0 < wt_article_size < 10000: return 'stubs' elif 10000 <= wt_article_size < 30000: return 'articles' elif wt_article_size >= 30000: return 'longarticles' def GetScoreForLang(lang): absent = lang_info[lang]['absent'] stubs = lang_info[lang]['stubs'] articles = lang_info[lang]['articles'] longarticles = lang_info[lang]['longarticles'] return GetScore(absent, stubs, articles, longarticles) def GetScore(absent, stubs, articles, longarticles): max_count = absent + stubs + articles + longarticles max_score = max_count * 9 raw_score = stubs + (articles*4) + (longarticles*9) if max_score > 0: score = 100.0 * raw_score / max_score else: score = 0 return score def GetLink(subtable,lang,value): return '[[/'+subtable+'#' + lang +' '+lang_info[lang]['localname']+ '|' + value + ']]' def GetTableNumber(count, min_subtable_count, max_subtable_count0, subtable, lang, max_subtable_count40=0): value = FormatNumber(count) max_subtable_count = max_subtable_count0 if GetScoreForLang(lang) > 40 and max_subtable_count40 > 0: max_subtable_count = max_subtable_count40 if count >= min_subtable_count and (count <= max_subtable_count or max_subtable_count==-1): return GetLink(subtable,lang,value) else: return value num_lang = 0 def CalculateStatistics(): for lang in lang_keys: CalculateStatisticsForLang(lang) def GetWeightForLang(lang): lang_weight = 1.0 if 'weight' in lang_info[lang]: lang_weight = lang_info[lang]['weight'] elif 'similar_lang' in lang_info[lang]: lang_weight = lang_info[lang_info[lang]['similar_lang']]['weight'] return lang_weight def CalculateStatisticsForLang(lang): global num_lang num_lang += 1 print ('=['+lang+' '+str(num_lang)+ '/' + str(len(lang_keys)) + ']').ljust(76,'=') try: lang_info[lang]['total_size'] = 0 lang_info[lang]['absent'] = 0 lang_info[lang]['stubs'] = 0 lang_info[lang]['articles'] = 0 lang_info[lang]['longarticles'] = 0 lang_info[lang]['art_count'] = 0 temp_path = "~%s_output.txt" % (lang) if os.path.isfile(temp_path): temp_file = open(temp_path) art_count = int(temp_file.readline()) lang_info[lang]['art_count'] = art_count for index in range(art_count): artKey = 'art_'+str(index) lang_info[lang][artKey] = {} lang_info[lang][artKey]['item'] = temp_file.readline().decode('utf_8').strip() lang_info[lang][artKey]['name'] = temp_file.readline().decode('utf_8').strip() lang_info[lang][artKey]['size'] = int(temp_file.readline()) lang_info[lang][artKey]['error'] = temp_file.readline().decode('utf_8').strip() temp_file.close() print '..using previous %s result...' % (lang) else: wiki = pywikibot.Site(lang, 'wikipedia') item_file = open("ItemList.txt") word_count = 0 for line in item_file: word_count += 1 if word_count > max_words > 0: break item = line[:-1].decode('utf_8') article_size = 0 error = '' try: article = GetArticle(item, wiki, lang) raw_article_size = len(article) interwiki_len = GetInterwikiLength(article) comment_len = GetCommentLength(article) article_size = (raw_article_size - interwiki_len - comment_len) if lang != "en" and lang != 'simple' and lang != 'sco' and IsArticleEnglish(article): raise TypeError ("Wrong language, [[%s:%s]] has too much untranslated English." % (lang, GetArticleInterwikiName(item, lang).encode("utf-8"))) lang_weight = GetWeightForLang(lang) print str(lang).ljust(3), str(word_count).rjust(3), item.ljust(30), print str(article_size * lang_weight).rjust(11), str(lang_weight).rjust(5), str(interwiki_len).rjust(9), str(comment_len).rjust(9) except KeyboardInterrupt: sys.exit(1) except Exception: e = sys.exc_info()[1] sys.stderr.write('\n') traceback.print_exc() sys.stderr.write('\n') try: error = CookString(unicode(str(e),'utf-8')) except: error = "Error." art_index = lang_info[lang]['art_count'] artKey = 'art_'+str(art_index) lang_info[lang][artKey] = {} lang_info[lang][artKey]['item'] = item if item in en_labels: lang_info[lang][artKey]['name'] = en_labels[item] else: lang_info[lang][artKey]['name'] = item lang_info[lang][artKey]['size'] = article_size lang_info[lang][artKey]['error'] = error lang_info[lang]['art_count'] = art_index + 1 item_file.close() temp_file = open(temp_path,'w') temp_file.write(str(lang_info[lang]['art_count'])+'\n') for index in range(lang_info[lang]['art_count']): artKey = 'art_'+str(index) temp_file.write(lang_info[lang][artKey]['item'].encode('utf_8')+'\n') temp_file.write(lang_info[lang][artKey]['name'].encode('utf_8')+'\n') temp_file.write(str(lang_info[lang][artKey]['size'])+'\n') temp_file.write(lang_info[lang][artKey]['error'].encode('utf_8')+'\n') temp_file.close() for index in range(lang_info[lang]['art_count']): artKey = 'art_'+str(index) article_size = lang_info[lang][artKey]['size'] wt_article_size = article_size * GetWeightForLang(lang) article_type = GetArticleType(wt_article_size) if not lang_info[lang][artKey]['error']: lang_info[lang][article_type] = lang_info[lang][article_type] + 1 lang_info[lang]['total_size'] = lang_info[lang]['total_size'] + article_size except: sys.stderr.write('\n') traceback.print_exc() sys.stderr.write('\n') def GetGrowthNumber(lang, score): if lang in prev_score: return score - prev_score[lang] def GetGrowth(lang, score): if lang in prev_score: growth = "%+2.2f" % round(GetGrowthNumber(lang, score),2) else: growth = "n/a" if growth == '-0.00': growth = '+0.00' return growth def GetAverageSize(lang, article_count): if article_count > 0: avg_size = int(round(lang_info[lang]['total_size'] / article_count)) else: avg_size = 0 return int(avg_size * GetWeightForLang(lang)) def GetMedianSize(lang): x = [] art_count = lang_info[lang]['art_count'] for index in range(art_count): artKey = 'art_'+str(index) size = lang_info[lang][artKey]['size'] if size > 0: x.append(size) x.sort() mid = len(x)/2 median_size = 0 if len(x) > 0: if len(x) % 2: median_size = x[mid] else: median_size = (x[mid-1] + x[mid]) / 2 return int(median_size * GetWeightForLang(lang)) def PrintResults(): lang_keys.sort(key=GetScoreForLang, reverse=True) print '\n' print 'RESULTS\n----------------------------------------------------------------------' print u'Lang:',' AvgSize','Median','Absent',' <10k ','10-30k',' >30k ', 'Score', 'Growth' for lang in lang_keys: absent = lang_info[lang]['absent'] stubs = lang_info[lang]['stubs'] articles = lang_info[lang]['articles'] longarticles = lang_info[lang]['longarticles'] article_count = stubs + articles + longarticles score = GetScore(absent, stubs, articles, longarticles) growth = GetGrowth(lang, score) avg_size = GetAverageSize(lang, article_count) med_size = GetMedianSize(lang) print lang.ljust(6), print str(avg_size).rjust(7), print str(med_size).rjust(7), print str(absent).rjust(5), print str(stubs).rjust(6), print str(articles).rjust(6), print str(longarticles).rjust(6), print ("%6.2f" % score).rjust(6), print growth.rjust(6) def GetWikiTableResults(awards): lang_keys.sort(key=GetScoreForLang, reverse=True) table = '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"' table += '\n|-\n' table += u'!width = 45 | № !! width = 55 | Wiki !! width = 220 | Language !! width = 55 | [[Talk:List of Wikipedias by sample of articles/Archives/2007#Proposed weighting of characters for formula_.28Option.232_using_Babel_text.29|Weight]] !! width = 120 | Mean Article
Size !! width = 120 | [[Talk:List_of_Wikipedias_by_sample_of_articles#average_or_median.3F|Median Article
Size]] !! width = 80 | [[/Absent Articles|Absent]]
(0k) !! width=80| Stubs
(< 10k)!! width = 80 | Articles
(10-30k) !! width = 80 | Long Art.
(> 30k) !! width = 80 | [[Talk:List of Wikipedias by sample of articles/Archives/2008#Other possibility of maximum score|Score]]' table += '!! width = 50 | [[Talk:List of Wikipedias by sample of articles/Archives/2008#Script_extension|Growth]]' table += '\n|-\n' i=0 for lang in lang_keys: i += 1 absent = lang_info[lang]['absent'] stubs = lang_info[lang]['stubs'] articles = lang_info[lang]['articles'] longarticles = lang_info[lang]['longarticles'] article_count = stubs + articles + longarticles dagger = u'†' if absent + article_count == 0: lang_footnote = dagger absent = lang_info['en']['art_count'] else: lang_footnote = '' table += '|' + str(i) + '\n' table += '| [[:' + lang + ':|' + lang + ']]' + lang_footnote + '\n' table += '| style = "text-align: left" | [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]\n' if 'weight' in lang_info[lang]: weight = str(lang_info[lang]['weight']) elif 'similar_lang' in lang_info[lang]: weight = str(lang_info[lang_info[lang]['similar_lang']]['weight']) + '**' else: weight = '1.0*' score = GetScore(absent, stubs, articles, longarticles) growth = GetGrowth(lang, score) avg_size = GetAverageSize(lang, article_count) med_size = GetMedianSize(lang) if HasAwards(awards, lang): growth = GetLink('Growth',lang, growth) table += '| ' + weight + '\n' table += '| ' + GetTableNumber(avg_size, 1, -1,'Neglected', lang) + '\n' table += '| ' + FormatNumber(med_size) + '\n' table += '| ' + GetTableNumber(absent, 1, 250,'Absent Articles', lang) + '\n' table += '| ' + GetTableNumber(stubs, 1, 100,'Stubs', lang, 250) + '\n' table += '| ' + GetTableNumber(articles, 1, 100,'Articles', lang, 250) + '\n' table += '| ' + GetTableNumber(longarticles, 1, 100,'Long Articles', lang) + '\n' #color code score if score >= 100.00: color = "|style = \"background: "+u'\u0023'+color10000+"\"" elif score >= 40.00: color = "|style = \"background: "+u'\u0023'+color4000+"\"" elif score >= 20.00: color = "|style = \"background: "+u'\u0023'+color2000+"\"" elif score >= 10.00: color = "|style = \"background: "+u'\u0023'+color1000+"\"" elif score >= 5.00: color = "|style = \"background: "+u'\u0023'+color500+"\"" elif score >= 2.50: color = "|style = \"background: "+u'\u0023'+color250+"\"" elif score >= 1.00: color = "|style = \"background: "+u'\u0023'+color100+"\"" else: color = "|style = \"background: "+u'\u0023'+color0+"\"" table += color + '| ' + ("%.2f" % score) + '\n' table += '| ' + growth + '\n' table += '|-\n' table = table[:-2] + '}' return table def GetWikiTableArticles(article_type, min_articles, max_articles_0, max_articles_40=0): lang_keys.sort() table = u'' i=0 for lang in lang_keys: i += 1 count=0 max_articles = max_articles_0 score = GetScoreForLang(lang) if score > 40 and max_articles_40 > 0: max_articles = max_articles_40 section = u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n' for index in range(lang_info[lang]['art_count']): artKey = 'art_'+str(index) artWtSize = GetArticleSize(lang, artKey) artType = GetArticleType(artWtSize) if artType == article_type: section += '#[[d:'+lang_info[lang][artKey]['item']+'|'+lang_info[lang][artKey]['name']+']] '+lang_info[lang][artKey]['error'] + '\n' count += 1 if min_articles <= count <= max_articles: table += section return table def GetArticleName(lang, artKey): if artKey in lang_info[lang]: return lang_info[lang][artKey]['name'] else: return 0 def GetArticleSize(lang, artKey): if artKey in lang_info[lang]: if lang_info[lang][artKey]['error'] : return 0; return lang_info[lang][artKey]['size'] * GetWeightForLang(lang) else: return 0 def GetEdgeFactor(lang, artKey): size = GetArticleSize(lang, artKey) if size==0: return 1 if 7000 < size < 1000: return (size - 7000) / 1000 if 24000 < size < 30000: return (size - 24000) / 1000 else: return 0 def GetRuntFactor(lang, artKey): size = GetArticleSize(lang, artKey) if size > 0: for index in range(lang_info['en']['art_count']): otherArtKey = 'art_'+str(index) if otherArtKey != artKey: otherSize = GetArticleSize(lang, otherArtKey) if 0 < otherSize < size: return 0 #you are not the runt return 4 return 0 def GetArticlePoints(lang, artKey): size = GetArticleSize(lang, artKey) if size > 0 and size < 10000: return 1 elif size > 10000 and size < 30000: return 4 elif size > 30000: return 9 return 0 def GetAverageArticlePoints(artKey): total = sum(GetArticlePoints(lang, artKey) for lang in lang_keys) return float(total) / len(lang_keys) def GetAverageArticleSize(artKey): total = sum(GetArticleSize(lang, artKey) for lang in lang_keys) return int(float(total) / len(lang_keys)) def GetNeglectForArticle(lang, artInfo): artKey = artInfo['artKey'] avgPnts = GetAverageArticlePoints(artKey) #0 to 9 pnts = GetArticlePoints(lang, artKey) #0 to 9 edgeFactor = GetEdgeFactor(lang, artKey) #0 to 6 runtFactor = GetRuntFactor(lang, artKey) #0 to 4 return avgPnts - pnts + edgeFactor + runtFactor def GetArticlesSortedByNeglect(lang): artInfos = [] if 'art_count' in lang_info['en']: for index in range(lang_info['en']['art_count']): artKey = 'art_'+str(index) artInfos.append( {} ) artInfos[index]['artKey'] = artKey artInfos[index]['popularity'] = GetAverageArticleSize(artKey) artInfos[index]['neglect'] = GetNeglectForArticle(lang, artInfos[index]) artInfos.sort(key=lambda x: (x['neglect'], x['popularity']), reverse=True) return artInfos def GetLargestArticles(artKey, maxLangs): lang_keys = lang_info.keys() lang_keys.sort(key=lambda lang: GetArticleSize(lang, artKey), reverse=True) item = lang_info['en'][artKey]['item'] ret = [] for lang in lang_keys[0:maxLangs]: ret.append ( '[['+lang+':'+GetArticleInterwikiName(item, lang)+'|'+lang+':'+FormatNumber(GetArticleSize(lang, artKey))+']]' ) return ' -- '.join(ret) def GetArticleTypeCount(artKey,points): return len([lang for lang in lang_keys if GetArticlePoints(lang, artKey) == points]) def GetNeglectedArticles(lang, max_articles): artInfos = GetArticlesSortedByNeglect(lang) i=0 table = u'' for artInfo in artInfos: if artInfo['artKey'] in lang_info[lang]: item = lang_info[lang][artInfo['artKey']]['item'] name = lang_info[lang][artInfo['artKey']]['name'] table += '#[[d:'+item+'|'+name+']]' size = int(GetArticleSize(lang, artInfo['artKey'])) if size > 0: iw_name = GetArticleInterwikiName(item, lang) if iw_name == '': table += ' ('+str(size) + ')' else: iw_link = lang+':'+iw_name table += ' ([['+iw_link+'|'+str(size)+']])' table += '\n' i+=1 if i >= max_articles: break return table def GetPopularArticles(max_articles): artInfos = GetArticlesSortedByNeglect('en') artInfos.sort(key=lambda x: x['popularity'], reverse=True) i=0 table = '{|class="wikitable sortable" border="1" cellpadding="2" cellspacing="0" style="width:100%; background: #f9f9f9; border: 1px solid #aaaaaa; border-collapse: collapse; white-space: nowrap; text-align: center"' table += '\n|-\n' table += u'!width = 45 | № !! width = 90 | Average Size !! width = 150 | Article Name !! width = 80 | [[Talk:List of Wikipedias by sample of articles#Article metric|Absent
(0k)]] !! width=80| Stubs
(< 10k)!! width = 80 | Articles
(10-30k) !! width = 80 | Long Art.
(> 30k) !! width = 150 | Largest Articles\n' for artInfo in artInfos: i+=1 artKey = artInfo['artKey'] table += '|-\n' table += '|' + str(i) table += '||'+FormatNumber(artInfo['popularity']) table += '||style="text-align:left"|[[d:'+lang_info['en'][artKey]['item']+'|'+lang_info['en'][artKey]['name']+']]' table += '||'+str(GetArticleTypeCount(artKey,0)) table += '||'+str(GetArticleTypeCount(artKey,1)) table += '||'+str(GetArticleTypeCount(artKey,4)) table += '||'+str(GetArticleTypeCount(artKey,9)) table += '||'+GetLargestArticles(artKey,4)+'\n' if i >= max_articles > 0: break table += '|}\n' return table def GetWikiNeglectedArticles(): lang_keys.sort() table = u'' print 'writing Popular Articles...' table += u'==Popular Articles==\n' table += GetPopularArticles(-1) print 'writing Neglected Articles...' table += u'==Neglected Articles==\n' for lang in lang_keys: print ' '+lang if lang_info[lang]['art_count'] > 0: table += u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n' table += GetNeglectedArticles(lang, 10) has_errors = False section = u'====Errors====\n' for index in range(lang_info[lang]['art_count']): artKey = 'art_'+str(index) if lang_info[lang][artKey]['error'] : section = section + '#[[d:'+lang_info[lang][artKey]['item']+'|'+lang_info['en'][artKey]['name']+']] '+lang_info[lang][artKey]['error'] + '\n' has_errors = True if has_errors: table = table + section return table def SaveWikiTableResults(awards): print 'writing Results...' f = open('results.txt', 'w') f.write(GetWikiTableResults(awards).encode("utf_8")) f.close() print 'writing Absent...' f = open('_absent.txt', 'w') f.write(GetWikiTableArticles('absent',1, 250).encode("utf_8")) f.close() print 'writing Stubs...' f = open('_stub.txt', 'w') f.write(GetWikiTableArticles('stubs',1, 100, 250).encode("utf_8")) f.close() print 'writing Articles...' f = open('_articles.txt', 'w') f.write(GetWikiTableArticles('articles',1, 100, 250).encode("utf_8")) f.close() print 'writing Long Articles...' f = open('_longarticles.txt', 'w') f.write(GetWikiTableArticles('longarticles',1,100).encode("utf_8")) f.close() print 'writing Awards...' f = open('_growth.txt', 'w') f.write(GetWikiAwards(awards).encode("utf_8")) f.close() print 'writing Suggestions...' f = open('_neglectedarticles.txt', 'w') f.write(GetWikiNeglectedArticles().encode("utf_8")) f.close() def CookString(rawString): cookString = '' for part in rawString.replace("'","||").split("|"): if len(part)==0: cookString += "'" else: cookString += eval("u'"+part+"'") return cookString def GetGrowths(article): growths = {} lang_last = 0 lang_first = article.find(u'[[:', lang_last) while lang_first > -1: lang_last = article.find(u'|', lang_first) if lang_last == -1: break lang = article[lang_first+3:lang_last-1] score_first = article.find(u'style = "background:',lang_last) if score_first == -1: break score_last = article.find(u'|', score_first+32) if score_last == -1: break growth_end = article.find(u'\n', score_last) growth_str = article[score_last+2:growth_end] try: growth_pipe = growth_str.find(u'|') if growth_pipe > -1: growth_str = growth_str[growth_pipe+1:-2] if growth_str.find(u' ‡') > -1: growth_str = growth_str[0:-2] growth = float(growth_str) except: growth = 0 growths[lang]=growth lang_first = article.find(u'[[:', score_last) return growths def GetLastUpdated(article): date_first = article.find(u'Last Update') if date_first > -1: date_last_paren = article.find(u'(', date_first) date_last_br = article.find(u'
', date_first) if date_last_paren > -1 and date_last_paren < date_last_br : date_last = date_last_paren else: date_last = date_last_br if date_last > -1: hyphen = article.find(u'-', date_first,date_last) if hyphen > -1: date_first = hyphen+1 else: date_first += 12 parts = article[date_first:date_last].strip().split(' ') if len(parts[0])==1: parts[0] = '0'+parts[0] if parts[0][0]==':': parts[0] = '0'+parts[0][1] parts[1] = parts[1][0:3] return ' '.join(parts) growthsG = {} def CalculatePlacing(growths,oldid,update): global growthsG growthsG = growths lang_keys = growths.keys() lang_keys.sort(key=lambda x: growthsG[x], reverse=True) placeNo=0 print update placing = [] for lang in lang_keys: if (placeNo < 3 or growths[lang] > 1) and growths[lang] != None: placeNo += 1 if placeNo==1: placestr = '1st Place' ribbonimg = 'Article blue.svg' elif placeNo==2: placestr = '2nd Place' ribbonimg = 'Article red.svg' elif placeNo==3: placestr = '3rd Place' ribbonimg = 'Article yellow.svg' elif placeNo>3: placestr = 'Honorable Mention' ribbonimg = 'Article green.svg' print " %d %-3s %+2.2f" % (placeNo, lang, growths[lang]) place = {'lang':lang,'growth':growths[lang],'oldid':oldid,'update':update,'placestr':placestr,'ribbonimg':ribbonimg} placing.append(place) return placing def GetPreviousAwards(): article_name = 'List of Wikipedias by sample of articles' meta_wiki = pywikibot.Site('meta', 'meta') meta_page = pywikibot.Page(meta_wiki, article_name) awards = {} prevUpdate = '' prevGrowth = -999 for oldid,datetime,username,comments in meta_page.getVersionHistory(): if datetime.year >= 2009 and ("updat" in comments.lower() or 'correct' in comments.lower()) and oldid!=2228213 and oldid!=2264612 and oldid!=3122655 and oldid!=3359817: article = meta_page.getOldVersion(get_redirect=False,oldid=oldid) growths = GetGrowths(article) if 'en' in growths: update = GetLastUpdated(article) growth = growths['en'] if update != prevUpdate and ( prevGrowth != growth or oldid > 3807780 ): prevUpdate = update prevGrowth = growth awards[update] = CalculatePlacing(growths,oldid,update) return awards def HasAwards(awards, lang): for placings in awards.values(): for place in placings: if lang == place['lang']: return True return False def CompareRows(rowA,rowB): if rowA['place']['placestr']==rowB['place']['placestr']: return cmp(rowB['place']['growth'],rowA['place']['growth']) return cmp(rowA['place']['placestr'],rowB['place']['placestr']) def GetWikiAwards(awards): table = u'==2009-2014 Improvement Awards==\n' for lang in lang_keys: section = u'==='+lang+' [[:w:' + lang_info[lang]['name'] + ' language|' + lang_info[lang]['localname'] + ']]===\n' rows = [] for update, placings in awards.items(): for place in placings: if lang == place['lang']: mid_section = '|-\n' mid_section += '|width = 150 | [[Image:%s|20px]] %s\n' % (place['ribbonimg'],place['placestr']) if place['oldid'] == -1: mid_section += '|width = 120 align=center| [[:m:List of Wikipedias by sample of articles|%s]]\n' % (place['update']) else: mid_section += '|width = 120 align=center| [http://meta.wikimedia.org/w/index.php?title=List_of_Wikipedias_by_sample_of_articles&oldid=%s %s]\n' % (place['oldid'],place['update']) mid_section += '|width = 80 align=center| %+2.2f\n' % round(place['growth'],2) rows.append({'place':place,'mid_section':mid_section}) if len(rows) > 0: rows.sort(CompareRows) if len(rows) > 1: section += '{|class="wikitable sortable" cellpadding="6" cellspacing="0"\n' section += '! !! !!\n' else: section += '{|class="wikitable" cellpadding="6" cellspacing="0"\n' for row in rows: section += row['mid_section'] section += '|}\n' table += section return table def CalculateAwards(): print "calculating awards..." todays = {} for lang in lang_keys: absent = lang_info[lang]['absent'] stubs = lang_info[lang]['stubs'] articles = lang_info[lang]['articles'] longarticles = lang_info[lang]['longarticles'] score = GetScore(absent, stubs, articles, longarticles) growth = GetGrowthNumber(lang, score) todays[lang] = growth update = strftime("%d %b %Y") placing = CalculatePlacing(todays,-1,update) awards = GetPreviousAwards() awards[update] = placing return awards #support dividing up work if len(sys.argv) == 3: part = int(sys.argv[1])-1 numparts = int(sys.argv[2]) lang_keys = filter(lambda lang: lang_keys.index(lang) % numparts == part, lang_keys) GetPreviousScores() CalculateStatistics() awards = CalculateAwards() PrintResults() SaveWikiTableResults(awards) GetItemList.py # -*- coding: utf_8 -*- import sys sys.path.append('./core') import pywikibot import traceback import os article_name = 'List of articles every Wikipedia should have' meta_wiki = pywikibot.Site('meta', 'meta') meta_page = pywikibot.Page(meta_wiki, article_name) article = meta_page.get(get_redirect=False) f = open('ItemList.txt', 'w') count = 0 grand_total = 0 name_last = 0 name_first = article.find(u'[[d:', name_last) while name_first > -1: name_mid = article.find(u'|', name_first) cat_start =article.rfind(u'\n== ', name_last, name_first) if cat_start > -1: cat_end = article.find(u'==',cat_start+3, name_first) if cat_end > -1: cat = article[cat_start+3:cat_end] print print cat print ''.center(len(cat),'-') count = 0 name_last = article.find(u']]', name_first) if name_last > name_mid: name_last = name_mid article_item = article[name_first+4:name_last] f.write(article_item.encode("utf_8")) f.write('\n') count += 1 grand_total += 1 print count, article_item name_first = article.find(u'[[d:', name_last) f.close() print '' print 'GRAND TOTAL' print '-----------' print grand_total, 'articles'