In [1]:
!pip install -U wmpaws
Requirement already satisfied: wmpaws in /srv/paws/lib/python3.8/site-packages (1.0)
Requirement already satisfied: requests in /srv/paws/lib/python3.8/site-packages (from wmpaws) (2.27.1)
Requirement already satisfied: pandas in /srv/paws/lib/python3.8/site-packages (from wmpaws) (1.4.1)
Requirement already satisfied: requests-oauthlib in /srv/paws/lib/python3.8/site-packages (from wmpaws) (1.3.1)
Requirement already satisfied: ipython in /srv/paws/lib/python3.8/site-packages (from wmpaws) (8.1.0)
Requirement already satisfied: pymysql in /srv/paws/lib/python3.8/site-packages (from wmpaws) (1.0.2)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (3.0.28)
Requirement already satisfied: traitlets>=5 in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (5.1.1)
Requirement already satisfied: jedi>=0.16 in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (0.18.1)
Requirement already satisfied: pexpect>4.3 in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (4.8.0)
Requirement already satisfied: pygments>=2.4.0 in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (2.11.2)
Requirement already satisfied: backcall in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (0.2.0)
Requirement already satisfied: pickleshare in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (0.7.5)
Requirement already satisfied: decorator in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (5.1.1)
Requirement already satisfied: stack-data in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (0.2.0)
Requirement already satisfied: matplotlib-inline in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (0.1.3)
Requirement already satisfied: setuptools>=18.5 in /srv/paws/lib/python3.8/site-packages (from ipython->wmpaws) (60.9.3)
Requirement already satisfied: pytz>=2020.1 in /srv/paws/lib/python3.8/site-packages (from pandas->wmpaws) (2021.3)
Requirement already satisfied: numpy>=1.18.5 in /srv/paws/lib/python3.8/site-packages (from pandas->wmpaws) (1.22.2)
Requirement already satisfied: python-dateutil>=2.8.1 in /srv/paws/lib/python3.8/site-packages (from pandas->wmpaws) (2.8.2)
Requirement already satisfied: certifi>=2017.4.17 in /srv/paws/lib/python3.8/site-packages (from requests->wmpaws) (2021.10.8)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /srv/paws/lib/python3.8/site-packages (from requests->wmpaws) (1.26.8)
Requirement already satisfied: idna<4,>=2.5 in /srv/paws/lib/python3.8/site-packages (from requests->wmpaws) (3.3)
Requirement already satisfied: charset-normalizer~=2.0.0 in /srv/paws/lib/python3.8/site-packages (from requests->wmpaws) (2.0.12)
Requirement already satisfied: oauthlib>=3.0.0 in /srv/paws/lib/python3.8/site-packages (from requests-oauthlib->wmpaws) (3.2.0)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in /srv/paws/lib/python3.8/site-packages (from jedi>=0.16->ipython->wmpaws) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in /srv/paws/lib/python3.8/site-packages (from pexpect>4.3->ipython->wmpaws) (0.7.0)
Requirement already satisfied: wcwidth in /srv/paws/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->wmpaws) (0.2.5)
Requirement already satisfied: six>=1.5 in /srv/paws/lib/python3.8/site-packages (from python-dateutil>=2.8.1->pandas->wmpaws) (1.16.0)
Requirement already satisfied: asttokens in /srv/paws/lib/python3.8/site-packages (from stack-data->ipython->wmpaws) (2.0.5)
Requirement already satisfied: pure-eval in /srv/paws/lib/python3.8/site-packages (from stack-data->ipython->wmpaws) (0.2.2)
Requirement already satisfied: executing in /srv/paws/lib/python3.8/site-packages (from stack-data->ipython->wmpaws) (0.8.3)

Outline¶

I take all articles in the category about the Russo-Ukrainian war in Russian language Wikipedia, collect their external links and identify whether they are from EU-banned media, Russia-banned media, or generally considered bad sources connected to Russia/Ukraine by English Wikipedia.

I then repeat the same analysis for the sibling categories in Ukrainian language Wikipedia, English language Wikipedia and the Russian involvement in Syria (in Russian) for comparison.

In [649]:
from IPython.display import display, Markdown, Latex, HTML

import os
import requests
from datetime import datetime

import wmpaws
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [567]:
banned_by_eu = [
    'rt.com',
    'sputniknews.com'
]
# https://www.reuters.com/business/media-telecom/russia-restricts-access-bbc-russian-service-radio-liberty-ria-2022-03-04/
banned_by_russia = [
    'bbc.co.uk',
    'bbc.com',
    'voanews.com',
    'rferl.org',
    'dw.com'
]
mainstream_western = [
    'cnn.com',
    'bbc.com',
    'bbc.co.uk',
    'reuters.com'
]
# Any sources on https://en.wikipedia.org/wiki/Wikipedia:Reliable_sources/Perennial_sources 
# with a mention of Ukraine or Russia that are listed as blacklisted, unreliable or state-sponsored fake news
enwiki_badsources = [
    'lenta.ru',
    '112ya.tv',
    'rt.com',
    'sputniknews.com',
    'dan-news.info',
    'news-front-info',
    'rusdialog.ru',
    'southfront.org',
    'topwar.ru',
    'ukraina.ru',
    'veteransnewsnow.com'
]
In [74]:
df = wmpaws.run_sql('''
SELECT DISTINCT
    exl.el_to AS destination,
    article.page_title AS wikipage,
    catlink0.cl_to AS category
FROM
    externallinks AS exl
    INNER JOIN page AS article ON article.page_id = exl.el_from 
    INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
    INNER JOIN page AS cat0 ON cat0.page_title = catlink0.cl_to
    INNER JOIN categorylinks AS catlink1 ON catlink1.cl_from = cat0.page_id
    INNER JOIN page AS cat1 ON cat1.page_title = catlink1.cl_to
    INNER JOIN categorylinks AS catlink2 ON catlink2.cl_from = cat1.page_id
WHERE
--    exl.el_index LIKE 'https://com.rt.%' AND
    article.page_namespace = 0 AND
    catlink2.cl_to = 'Российско-украинская_война'
LIMIT 100
;
''', 'ruwiki')
In [382]:
def query_cat5(
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://com.rt.%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    df = wmpaws.run_sql('''
    SELECT DISTINCT
        exl.el_to AS destination,
        article.page_title AS wikipage,
        catlink0.cl_to AS category,
        catlink1.cl_to AS cat1,
        catlink2.cl_to AS cat2,
        catlink3.cl_to AS cat3,
        catlink4.cl_to AS cat4,
        catlink5.cl_to AS cat5
    FROM
        externallinks AS exl
        INNER JOIN page AS article ON article.page_id = exl.el_from 
        INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
        INNER JOIN page AS cat0 ON cat0.page_title = catlink0.cl_to
        INNER JOIN categorylinks AS catlink1 ON catlink1.cl_from = cat0.page_id
        INNER JOIN page AS cat1 ON cat1.page_title = catlink1.cl_to
        INNER JOIN categorylinks AS catlink2 ON catlink2.cl_from = cat1.page_id
        INNER JOIN page AS cat2 ON cat2.page_title = catlink2.cl_to
        INNER JOIN categorylinks AS catlink3 ON catlink3.cl_from = cat2.page_id
        INNER JOIN page AS cat3 ON cat3.page_title = catlink3.cl_to
        INNER JOIN categorylinks AS catlink4 ON catlink4.cl_from = cat3.page_id
        INNER JOIN page AS cat4 ON cat4.page_title = catlink4.cl_to
        INNER JOIN categorylinks AS catlink5 ON catlink5.cl_from = cat4.page_id
    WHERE
        exl.el_index LIKE ''' + referencelink + ''' AND
        article.page_namespace = 0 AND
        catlink5.cl_to = ''' + maincat + ''' AND
        catlink5.cl_type = 'subcat' AND
        catlink4.cl_to != ''' + maincat + ''' AND
        catlink4.cl_type = 'subcat' AND
        catlink3.cl_to != ''' + maincat + ''' AND
        catlink3.cl_type = 'subcat' AND
        catlink2.cl_to != ''' + maincat + ''' AND
        catlink2.cl_type = 'subcat' AND
        catlink1.cl_to != ''' + maincat + ''' AND
        catlink1.cl_type = 'subcat' AND
        catlink0.cl_to != ''' + maincat + ''' AND
        catlink0.cl_type = 'page'
    LIMIT ''' + limit + '''
    ;
    ''', wiki)
    print('cat5:', len(df))
    return(df)
# query_cat5()
In [383]:
def query_cat4(
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://com.rt.%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    df = wmpaws.run_sql('''
    SELECT DISTINCT
        exl.el_to AS destination,
        article.page_title AS wikipage,
        catlink0.cl_to AS category
    FROM
        externallinks AS exl
        INNER JOIN page AS article ON article.page_id = exl.el_from 
        INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
        INNER JOIN page AS cat0 ON cat0.page_title = catlink0.cl_to
        INNER JOIN categorylinks AS catlink1 ON catlink1.cl_from = cat0.page_id
        INNER JOIN page AS cat1 ON cat1.page_title = catlink1.cl_to
        INNER JOIN categorylinks AS catlink2 ON catlink2.cl_from = cat1.page_id
        INNER JOIN page AS cat2 ON cat2.page_title = catlink2.cl_to
        INNER JOIN categorylinks AS catlink3 ON catlink3.cl_from = cat2.page_id
        INNER JOIN page AS cat3 ON cat3.page_title = catlink3.cl_to
        INNER JOIN categorylinks AS catlink4 ON catlink4.cl_from = cat3.page_id
    WHERE
        exl.el_index LIKE ''' + referencelink + ''' AND
        article.page_namespace = 0 AND
        catlink4.cl_to = ''' + maincat + ''' AND
        catlink4.cl_type = 'subcat' AND
        catlink3.cl_to != ''' + maincat + ''' AND
        catlink3.cl_type = 'subcat' AND
        catlink2.cl_to != ''' + maincat + ''' AND
        catlink2.cl_type = 'subcat' AND
        catlink1.cl_to != ''' + maincat + ''' AND
        catlink1.cl_type = 'subcat' AND
        catlink0.cl_to != ''' + maincat + ''' AND
        catlink0.cl_type = 'page'
    LIMIT ''' + limit + '''
    ;
    ''', wiki)
    print('cat4:', len(df))
    return(df)
# query_cat4()
In [384]:
def query_cat3(
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://com.rt.%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    df = wmpaws.run_sql('''
    SELECT DISTINCT
        exl.el_to AS destination,
        article.page_title AS wikipage,
        catlink0.cl_to AS category
    FROM
        externallinks AS exl
        INNER JOIN page AS article ON article.page_id = exl.el_from 
        INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
        INNER JOIN page AS cat0 ON cat0.page_title = catlink0.cl_to
        INNER JOIN categorylinks AS catlink1 ON catlink1.cl_from = cat0.page_id
        INNER JOIN page AS cat1 ON cat1.page_title = catlink1.cl_to
        INNER JOIN categorylinks AS catlink2 ON catlink2.cl_from = cat1.page_id
        INNER JOIN page AS cat2 ON cat2.page_title = catlink2.cl_to
        INNER JOIN categorylinks AS catlink3 ON catlink3.cl_from = cat2.page_id
    WHERE
        exl.el_index LIKE ''' + referencelink + ''' AND
        article.page_namespace = 0 AND
        catlink3.cl_to = ''' + maincat + ''' AND
        catlink3.cl_type = 'subcat' AND
        catlink2.cl_to != ''' + maincat + ''' AND
        catlink2.cl_type = 'subcat' AND
        catlink1.cl_to != ''' + maincat + ''' AND
        catlink1.cl_type = 'subcat' AND
        catlink0.cl_to != ''' + maincat + ''' AND
        catlink0.cl_type = 'page'
    LIMIT ''' + limit + '''
    ;
    ''', wiki)
    print('cat3:', len(df))
    return(df)
# query_cat3()
In [385]:
def query_cat2(
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://com.rt.%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    df = wmpaws.run_sql('''
    SELECT DISTINCT
        exl.el_to AS destination,
        article.page_title AS wikipage,
        catlink0.cl_to AS category
    FROM
        externallinks AS exl
        INNER JOIN page AS article ON article.page_id = exl.el_from 
        INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
        INNER JOIN page AS cat0 ON cat0.page_title = catlink0.cl_to
        INNER JOIN categorylinks AS catlink1 ON catlink1.cl_from = cat0.page_id
        INNER JOIN page AS cat1 ON cat1.page_title = catlink1.cl_to
        INNER JOIN categorylinks AS catlink2 ON catlink2.cl_from = cat1.page_id
    WHERE
        exl.el_index LIKE ''' + referencelink + ''' AND
        article.page_namespace = 0 AND
        catlink2.cl_to = ''' + maincat + ''' AND
        catlink2.cl_type = 'subcat' AND
        catlink1.cl_to != ''' + maincat + ''' AND
        catlink1.cl_type = 'subcat' AND
        catlink0.cl_to != ''' + maincat + ''' AND
        catlink0.cl_type = 'page'
    LIMIT ''' + limit + '''
    ;
    ''', wiki)
    print('cat2:', len(df))
    return(df)
# query_cat2()
In [386]:
def query_cat1(
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://com.rt.%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    df = wmpaws.run_sql('''
    SELECT DISTINCT
        exl.el_to AS destination,
        article.page_title AS wikipage,
        catlink0.cl_to AS category
    FROM
        externallinks AS exl
        INNER JOIN page AS article ON article.page_id = exl.el_from 
        INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
        INNER JOIN page AS cat0 ON cat0.page_title = catlink0.cl_to
        INNER JOIN categorylinks AS catlink1 ON catlink1.cl_from = cat0.page_id
    WHERE
        exl.el_index LIKE ''' + referencelink + ''' AND
        article.page_namespace = 0 AND
        catlink1.cl_to = ''' + maincat + ''' AND
        catlink1.cl_type = 'subcat' AND
        catlink0.cl_to != ''' + maincat + ''' AND
        catlink0.cl_type = 'page'
    LIMIT ''' + limit + '''
    ;
    ''', wiki)
    print('cat1:', len(df))
    return(df)
# query_cat1()
In [387]:
def query_cat0(
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://com.rt.%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    df = wmpaws.run_sql('''
    SELECT DISTINCT
        exl.el_to AS destination,
        article.page_title AS wikipage,
        catlink0.cl_to AS category
    FROM
        externallinks AS exl
        INNER JOIN page AS article ON article.page_id = exl.el_from 
        INNER JOIN categorylinks AS catlink0 ON catlink0.cl_from = article.page_id
    WHERE
        exl.el_index LIKE ''' + referencelink + ''' AND
        article.page_namespace = 0 AND
        catlink0.cl_to = ''' + maincat + ''' AND
        catlink0.cl_type = 'page'
    LIMIT ''' + limit + '''
    ;
    ''', wiki)
    print('cat0:', len(df))
    return(df)
# query_cat0()
In [611]:
def big_query(    
    maincat = '''"Российско-украинская_война"''', 
    referencelink = '''"https://%"''',
    limit = '100',
    wiki = 'ruwiki'
):
    maincat_2015 = '''"Военная_операция_России_в_Сирии"'''
    df = pd.concat([
        query_cat0(
            maincat = maincat, 
            referencelink = referencelink,
            limit = limit,
            wiki = wiki
        ),
        query_cat1(
            maincat = maincat, 
            referencelink = referencelink,
            limit = limit,
            wiki = wiki
        ),
        query_cat2(
            maincat = maincat, 
            referencelink = referencelink,
            limit = limit,
            wiki = wiki
        ),
        query_cat3(
            maincat = maincat, 
            referencelink = referencelink,
            limit = limit,
            wiki = wiki
        ),
        query_cat4(
            maincat = maincat, 
            referencelink = referencelink,
            limit = limit,
            wiki = wiki
        )
    ])
    # clean destinations
    df['destination'] = df['destination'] \
        .str.removeprefix('http://') \
        .str.removeprefix('https://') \
        .str.removeprefix('//') \
        .str.removeprefix('www.')
    return(df)
In [396]:
def extract_archive(df, domain, position):
    if sum(df['domain'] == domain)>0:
        df.loc[df['domain'] == domain,'truedomain'] = df[df['domain'] == domain]['destination'].str.removeprefix(domain).str.split('/', expand=True).loc[:,position].str.removeprefix('www.')
    return(df)

def clean_domains(df):
    df['domain'] = df['destination'].str.split('/', expand=True).loc[:,0]
    df['truedomain'] = df['domain']
    df = extract_archive(df, 'web.archive.org', 5)
    df = extract_archive(df, 'archive.is', 4)
    df = extract_archive(df, 'ghostarchive.org', 5)
    df = extract_archive(df, 'webcitation.org', 3)
    df = extract_archive(df, 'archive.today', 4)
    return(df)
In [577]:
def categorize_sources(df, top_x = 25):
    for domain in banned_by_eu:
        df.loc[df['truedomain'] == domain, 'banned_by_eu'] = True
        df.loc[df['truedomain'].str.endswith('.' + domain) == True, 'banned_by_eu'] = True
    for domain in banned_by_russia:
        df.loc[df['truedomain'] == domain, 'banned_by_russia'] = True
        df.loc[df['truedomain'].str.endswith('.' + domain) == True, 'banned_by_russia'] = True
    for domain in enwiki_badsources:
        df.loc[df['truedomain'] == domain, 'enwiki_badsource'] = True
        df.loc[df['truedomain'].str.endswith('.' + domain) == True, 'enwiki_badsource'] = True
    for domain in mainstream_western:
        df.loc[df['truedomain'] == domain, 'mainstream_west'] = True
        df.loc[df['truedomain'].str.endswith('.' + domain) == True, 'mainstream_west'] = True
    print(df.groupby('truedomain')[['banned_by_eu', 'banned_by_russia', 'mainstream_west', 'enwiki_badsource']].sum().sum())
    print(
        'ratio banned by Europe / banned by Russia:', 
        df.groupby('truedomain')['banned_by_eu'].sum().sum() / df.groupby('truedomain')['banned_by_russia'].sum().sum()
         )
    print(
        df.groupby('truedomain')[['banned_by_eu', 'banned_by_russia', 'mainstream_west', 'enwiki_badsource']].sum()\
            .sort_values(['banned_by_eu', 'banned_by_russia', 'enwiki_badsource', 'mainstream_west'], ascending=False)\
            .head(top_x)
    )
    return(df)
In [751]:
def make_piechart(df, threshold=0.01, filename=False):
    df_agg = df.groupby('truedomain').size().sort_values(ascending=False)
    df_agg = df_agg / df_agg.sum()
    df_agg2 = df_agg[df_agg >= threshold]
    df_agg2['other'] = df_agg[df_agg < threshold].sum()
    cmap = plt.get_cmap('Purples')
    colors = list(cmap(np.linspace(0.8,0.1,len(df_agg2))))
    
    fig, ax = plt.subplots()
    ax.pie(df_agg2.values, colors=colors)
    ax.axis('equal')
    plt.legend(df_agg2.index, bbox_to_anchor = (0,1))
    if filename:
        plt.savefig('ruwiki_links/figs/' + filename, bbox_inches='tight')
    plt.show()

2022_ru¶

All articles in the Russian Wikipedia category about the Russo-Ukrainian war. The category is about 'the armed conflict since 2014'.

https://ru.wikipedia.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B9%D1%81%D0%BA%D0%BE-%D1%83%D0%BA%D1%80%D0%B0%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D0%B2%D0%BE%D0%B9%D0%BD%D0%B0

In [735]:
df_2022_ru = big_query(
    maincat = '''"Российско-украинская_война"''',
    referencelink = '''"https://%"''',
    wiki = 'ruwiki',
    limit = '100000'
)
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat0: 1408
cat1: 7465
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat2: 8366
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat3: 5576
cat4: 1412
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
In [736]:
df_2022_ru = clean_domains(df_2022_ru)
print('number of articles:', len(df_2022_ru['wikipage'].unique()), 'number of links:', len(df_2022_ru), 'unique:', len(df_2022_ru.destination.unique()))
df_2022_ru.groupby('truedomain').size().sort_values(ascending=False)[:10]
number of articles: 677 number of links: 24227 unique: 16164
Out[736]:
truedomain
kommersant.ru      1577
youtube.com         955
ria.ru              931
rbc.ru              742
tass.ru             640
lenta.ru            532
novayagazeta.ru     389
bbc.com             383
meduza.io           318
interfax.ru         275
dtype: int64
In [752]:
make_piechart(df_2022_ru, filename='ruwiki_2022.png')
In [738]:
df_2022_ru = categorize_sources(df_2022_ru, top_x = 15)
banned_by_eu         95
banned_by_russia    603
mainstream_west     724
enwiki_badsource    703
dtype: object
ratio banned by Europe / banned by Russia: 0.15754560530679934
                   banned_by_eu banned_by_russia mainstream_west  \
truedomain                                                         
russian.rt.com               81                0               0   
rt.com                        7                0               0   
inotv.rt.com                  3                0               0   
sputniknews.com               3                0               0   
br.sputniknews.com         True                0               0   
bbc.com                       0              383             383   
dw.com                        0              136               0   
bbc.co.uk                     0               30              30   
rferl.org                     0               26               0   
voanews.com                   0                9               0   
p.dw.com                      0                7               0   
news.bbc.co.uk                0                5               5   
amp.dw.com                    0                3               0   
docs.rferl.org                0                2               0   
m.bbc.co.uk                   0             True            True   

                   enwiki_badsource  
truedomain                           
russian.rt.com                   81  
rt.com                            7  
inotv.rt.com                      3  
sputniknews.com                   3  
br.sputniknews.com             True  
bbc.com                           0  
dw.com                            0  
bbc.co.uk                         0  
rferl.org                         0  
voanews.com                       0  
p.dw.com                          0  
news.bbc.co.uk                    0  
amp.dw.com                        0  
docs.rferl.org                    0  
m.bbc.co.uk                       0  

2015_ru¶

Same analysis, but for the Russian Wikipedia category on the Russian engagement in Syria

In [739]:
df_2015_ru = big_query(
    maincat = '''"Военная_операция_России_в_Сирии"''',
    referencelink = '''"https://%"''',
    wiki = 'ruwiki',
    limit = '100000'
)
cat0: 666
cat1: 546
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat2: 3623
cat3: 0
cat4: 0
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
In [740]:
df_2015_ru = clean_domains(df_2015_ru)
print('number of articles:', len(df_2015_ru['wikipage'].unique()), 'number of links:', len(df_2015_ru), 'unique:', len(df_2015_ru.destination.unique()))
df_2015_ru.groupby('truedomain').size().sort_values(ascending=False)[:10]
number of articles: 240 number of links: 4835 unique: 4147
Out[740]:
truedomain
ria.ru              241
youtube.com         231
kommersant.ru       184
lenta.ru            127
function.mil.ru     120
tass.ru             113
rbc.ru              105
rg.ru               104
structure.mil.ru     93
almasdarnews.com     64
dtype: int64
In [753]:
make_piechart(df_2015_ru, filename='ruwiki_2015.png')
In [742]:
df_2015_ru = categorize_sources(df_2015_ru, top_x = 15)
banned_by_eu         43
banned_by_russia     29
mainstream_west      64
enwiki_badsource    175
dtype: object
ratio banned by Europe / banned by Russia: 1.4827586206896552
                      banned_by_eu banned_by_russia mainstream_west  \
truedomain                                                            
russian.rt.com                  40                0               0   
rt.com                           3                0               0   
bbc.com                          0               22              22   
dw.com                           0                4               0   
bbc.co.uk                        0             True            True   
monitoring.bbc.co.uk             0             True            True   
rferl.org                        0             True               0   
lenta.ru                         0                0               0   
dan-news.info                    0                0               0   
ukraina.ru                       0                0               0   
southfront.org                   0                0               0   
reuters.com                      0                0              32   
ru.reuters.com                   0                0               4   
uk.reuters.com                   0                0               2   
ca.reuters.com                   0                0            True   

                     enwiki_badsource  
truedomain                             
russian.rt.com                     40  
rt.com                              3  
bbc.com                             0  
dw.com                              0  
bbc.co.uk                           0  
monitoring.bbc.co.uk                0  
rferl.org                           0  
lenta.ru                          127  
dan-news.info                       2  
ukraina.ru                          2  
southfront.org                   True  
reuters.com                         0  
ru.reuters.com                      0  
uk.reuters.com                      0  
ca.reuters.com                      0  

2022_uk¶

Same analysis, for the sibling category on Ukrainian language Wikipedia.

In [743]:
df_2022_uk = big_query(
    maincat = '''"Російська_збройна_агресія_проти_України_(з_2014)"''',
    referencelink = '''"https://%"''',
    wiki = 'ukwiki',
    limit = '100000'
)
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat0: 5802
cat1: 13709
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat2: 54505
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat3: 61236
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat4: 47023
In [744]:
df_2022_uk = clean_domains(df_2022_uk)
print('number of articles:', len(df_2022_uk['wikipage'].unique()), 'number of links:', len(df_2022_uk), 'unique:', len(df_2022_uk.destination.unique()))
df_2022_uk.groupby('truedomain').size().sort_values(ascending=False)#[:10]
number of articles: 11669 number of links: 182275 unique: 62489
Out[744]:
truedomain
zakon.rada.gov.ua    13228
youtube.com          12042
facebook.com         10792
tools.wmflabs.org     4986
pravda.com.ua         3379
                     ...  
npu.go.jp                1
gbp.com.sg               1
gblor.ru                 1
candidate.if.ua          1
kod-ua.com               1
Length: 8127, dtype: int64
In [754]:
make_piechart(df_2022_uk, filename='ukwiki_2022.png')
In [746]:
df_2022_uk = categorize_sources(df_2022_uk, top_x = 15)
banned_by_eu          41
banned_by_russia    1411
mainstream_west     1407
enwiki_badsource     298
dtype: object
ratio banned by Europe / banned by Russia: 0.029057406094968107
                       banned_by_eu banned_by_russia  mainstream_west  \
truedomain                                                              
russian.rt.com                   18                0                0   
sputniknews.com                  13                0                0   
actualidad.rt.com                 4                0                0   
inotv.rt.com                      4                0                0   
rt.com                            2                0                0   
bbc.com                           0              856              856   
dw.com                            0              280                0   
bbc.co.uk                         0              122              122   
ukrainian.voanews.com             0               45                0   
rferl.org                         0               39                0   
p.dw.com                          0               36                0   
voanews.com                       0               16                0   
news.bbc.co.uk                    0                7                7   
m.bbc.co.uk                       0                4                4   
www1.voanews.com                  0                3                0   

                       enwiki_badsource  
truedomain                               
russian.rt.com                       18  
sputniknews.com                      13  
actualidad.rt.com                     4  
inotv.rt.com                          4  
rt.com                                2  
bbc.com                               0  
dw.com                                0  
bbc.co.uk                             0  
ukrainian.voanews.com                 0  
rferl.org                             0  
p.dw.com                              0  
voanews.com                           0  
news.bbc.co.uk                        0  
m.bbc.co.uk                           0  
www1.voanews.com                      0  

2022_en¶

Same analysis, of the sibling category in English language Wikipedia

In [747]:
df_2022_en = big_query(
    maincat = '''"Russo-Ukrainian_War"''',
    referencelink = '''"https://%"''',
    wiki = 'enwiki',
    limit = '100000'
)
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat0: 8418
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat1: 13921
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat2: 9318
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat3: 9952
/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:761: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy
  warnings.warn(
cat4: 6758
In [756]:
df_2022_en = clean_domains(df_2022_en)
print('number of articles:', len(df_2022_en['wikipage'].unique()), 'number of links:', len(df_2022_en), 'unique:', len(df_2022_en.destination.unique()))
df_2022_en.groupby('truedomain').size().sort_values(ascending=False)#[:10]
number of articles: 615 number of links: 48367 unique: 22247
Out[756]:
truedomain
reuters.com            2547
bbc.com                2254
theguardian.com        1576
nytimes.com            1060
washingtonpost.com      782
                       ... 
fiba.basketball           1
sedmitza.ru               1
leparisien.fr             1
semana.com                1
festival-cannes.com       1
Length: 3509, dtype: int64
In [755]:
make_piechart(df_2022_en, filename='enwiki_2022.png')
In [750]:
df_2022_en = categorize_sources(df_2022_en, top_x = 15)
banned_by_eu           0
banned_by_russia    3775
mainstream_west     6167
enwiki_badsource      58
dtype: object
ratio banned by Europe / banned by Russia: 0.0
                      banned_by_eu banned_by_russia mainstream_west  \
truedomain                                                            
bbc.com                          0             2254            2254   
bbc.co.uk                        0              534             534   
rferl.org                        0              531               0   
dw.com                           0              349               0   
voanews.com                      0               65               0   
news.bbc.co.uk                   0               20              20   
m.bbc.co.uk                      0                6               6   
m.dw.com                         0                5               0   
m.voanews.com                    0                4               0   
pressroom.rferl.org              0                3               0   
amp.dw.com                       0             True               0   
docs.rferl.org                   0             True               0   
flashvideo.rferl.org             0             True               0   
p.dw.com                         0             True               0   
lenta.ru                         0                0               0   

                      enwiki_badsource  
truedomain                              
bbc.com                              0  
bbc.co.uk                            0  
rferl.org                            0  
dw.com                               0  
voanews.com                          0  
news.bbc.co.uk                       0  
m.bbc.co.uk                          0  
m.dw.com                             0  
m.voanews.com                        0  
pressroom.rferl.org                  0  
amp.dw.com                           0  
docs.rferl.org                       0  
flashvideo.rferl.org                 0  
p.dw.com                             0  
lenta.ru                            58  
In [ ]: