This notebook ranks German Wikipedia section headers by the number of articles in which they appear, as part of this research project.
import numpy as np
import pandas as pd
# read the headings file lazily, 100000 rows per chunk, to conserve memory
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
# NOTE(review): read_csv's default NA parsing turns literal cell values such as
# 'NA' or 'null' into NaN, even for object columns -- confirm this is acceptable
# for page_title / heading_text
tp = pd.read_csv(
    'dewiki_20161101_headings.tsv',
    sep='\t',
    header=0,
    dtype={
        'page_id': np.int32,
        'page_title': object,
        'page_ns': np.int16,
        'heading_level': np.int8,
        'heading_text': object,
    },
    iterator=True,
    chunksize=100000,
)
# stitch all chunks back together into a single pandas dataframe
# (pd.concat consumes the chunk iterator directly)
de_DF = pd.concat(tp)
# preview the first rows
de_DF.head()
# remove leading and trailing whitespace from heading_text column
# uses the public .str accessor: the module-level pd.core.strings.str_strip helper
# is a private internal that is no longer available in newer pandas releases
# missing values (NaN) are left as NaN
de_DF['heading_text'] = de_DF['heading_text'].str.strip()
# group by heading_text and count the number of distinct page_titles each heading appears in,
# then sort the counts in descending order; this returns a pandas series object
# nunique is the built-in groupby aggregation for this and avoids calling a Python
# lambda once per heading; dropna=False keeps the len(x.unique()) semantics, where a
# missing page_title counts as one distinct value
article_count = (
    de_DF.groupby('heading_text')['page_title']
    .nunique(dropna=False)
    .sort_values(ascending=False)
)
# turn pandas series object into pandas dataframe
# (one row per heading: the heading text and the number of articles containing it)
de_article_count_DF = pd.DataFrame({
    'section_title': article_count.index,
    'number_of_articles': article_count.values,
})
# denominator for the percentage column
# NOTE(review): presumably the total number of German Wikipedia articles in the
# 2016-11-01 dump -- confirm the source of this figure
total_articles = 1993198
# add a column for the percentage of articles that header appears in
article_share = de_article_count_DF['number_of_articles'] / total_articles
de_article_count_DF['article_percentage'] = article_share * 100
# let pandas display up to 100 rows
pd.options.display.max_rows = 100
# show the top 100 results with the percentage rounded to 2 decimal places
# (rounding applies to the displayed copy only; the stored column keeps full precision)
de_article_count_DF.head(100).round({'article_percentage': 2})