In [5]:
!pip install tensorflow keras
Collecting tensorflow
  Downloading https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl (109.2MB)
    100% |████████████████████████████████| 109.2MB 5.0kB/s eta 0:00:01   15% |████▉                           | 16.5MB 17.1MB/s eta 0:00:06    16% |█████▎                          | 17.8MB 24.2MB/s eta 0:00:04    18% |██████                          | 20.5MB 59.8MB/s eta 0:00:02    33% |██████████▋                     | 36.3MB 45.9MB/s eta 0:00:02    42% |█████████████▌                  | 45.9MB 32.7MB/s eta 0:00:02    52% |█████████████████               | 57.8MB 48.4MB/s eta 0:00:02    70% |██████████████████████▋         | 77.3MB 511kB/s eta 0:01:03    76% |████████████████████████▋       | 84.0MB 30.8MB/s eta 0:00:01    89% |████████████████████████████▌   | 97.4MB 36.8MB/s eta 0:00:01
Collecting astor>=0.6.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz
Collecting google-pasta>=0.1.6 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/c3/fd/1e86bc4837cc9a3a5faf3db9b1854aa04ad35b5f381f9648fbe81a6f94e4/google_pasta-0.1.8-py3-none-any.whl (57kB)
    100% |████████████████████████████████| 61kB 1.1MB/s eta 0:00:01
Collecting grpcio>=1.8.6 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/f1/97/bece4417f349f8f83252232ef66ea63eb47f8044ca61b51e2a478e2c7a94/grpcio-1.27.2-cp36-cp36m-manylinux1_x86_64.whl (2.7MB)
    100% |████████████████████████████████| 2.7MB 183kB/s eta 0:00:01
Collecting wheel>=0.26 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/8c/23/848298cccf8e40f5bbb59009b32848a4c38f4e7f3364297ab3c3e2e2cd14/wheel-0.34.2-py2.py3-none-any.whl
Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed0510d/tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488kB)
    100% |████████████████████████████████| 491kB 829kB/s eta 0:00:01
Collecting tensorboard<1.15.0,>=1.14.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)
    100% |████████████████████████████████| 3.2MB 168kB/s eta 0:00:011
Requirement already satisfied: keras-preprocessing>=1.0.5 in /srv/paws/lib/python3.6/site-packages (from tensorflow)
Requirement already satisfied: keras-applications>=1.0.6 in /srv/paws/lib/python3.6/site-packages (from tensorflow)
Collecting protobuf>=3.6.1 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/57/02/5432412c162989260fab61fa65e0a490c1872739eb91a659896e4d554b26/protobuf-3.11.3-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
    100% |████████████████████████████████| 1.3MB 402kB/s  eta 0:00:01
Collecting wrapt>=1.11.1 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/ee/bc/7993faa8084b5a5dbabb07a197ae1b7590da4752dc80455d878573553e2f/wrapt-1.12.0.tar.gz
Requirement already satisfied: six>=1.10.0 in /srv/paws/lib/python3.6/site-packages (from tensorflow)
Requirement already satisfied: numpy<2.0,>=1.14.5 in /srv/paws/lib/python3.6/site-packages (from tensorflow)
Collecting absl-py>=0.7.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/1a/53/9243c600e047bd4c3df9e69cfabc1e8004a82cac2e0c484580a78a94ba2a/absl-py-0.9.0.tar.gz (104kB)
    100% |████████████████████████████████| 112kB 2.7MB/s eta 0:00:01
Collecting gast>=0.2.0 (from tensorflow)
  Downloading https://files.pythonhosted.org/packages/d6/84/759f5dd23fec8ba71952d97bcc7e2c9d7d63bdc582421f3cd4be845f0c98/gast-0.3.3-py2.py3-none-any.whl
Collecting setuptools>=41.0.0 (from tensorboard<1.15.0,>=1.14.0->tensorflow)
  Downloading https://files.pythonhosted.org/packages/3d/72/1c1498c1e908e0562b1e1cd30012580baa7d33b5b0ffdbeb5fde2462cc71/setuptools-45.2.0-py3-none-any.whl (584kB)
    100% |████████████████████████████████| 593kB 684kB/s eta 0:00:01   17% |█████▋                          | 102kB 487kB/s eta 0:00:01
Requirement already satisfied: werkzeug>=0.11.15 in /srv/paws/lib/python3.6/site-packages (from tensorboard<1.15.0,>=1.14.0->tensorflow)
Collecting markdown>=2.6.8 (from tensorboard<1.15.0,>=1.14.0->tensorflow)
  Downloading https://files.pythonhosted.org/packages/ab/c4/ba46d44855e6eb1770a12edace5a165a0c6de13349f592b9036257f3c3d3/Markdown-3.2.1-py2.py3-none-any.whl (88kB)
    100% |████████████████████████████████| 92kB 372kB/s ta 0:00:011
Requirement already satisfied: h5py in /srv/paws/lib/python3.6/site-packages (from keras-applications>=1.0.6->tensorflow)
Building wheels for collected packages: termcolor, wrapt, absl-py
  Running setup.py bdist_wheel for termcolor ... error
  Complete output from command /srv/paws/bin/python3.6 -u -c "import setuptools, tokenize;__file__='/tmp/pip-build-ayy99os1/termcolor/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/tmptooyfnixpip-wheel- --python-tag cp36:
  usage: -c [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
     or: -c --help [cmd1 cmd2 ...]
     or: -c --help-commands
     or: -c cmd --help
  
  error: invalid command 'bdist_wheel'
  
  ----------------------------------------
  Failed building wheel for termcolor
  Running setup.py clean for termcolor
  Running setup.py bdist_wheel for wrapt ... error
  Complete output from command /srv/paws/bin/python3.6 -u -c "import setuptools, tokenize;__file__='/tmp/pip-build-ayy99os1/wrapt/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/tmp0k93uzmspip-wheel- --python-tag cp36:
  usage: -c [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
     or: -c --help [cmd1 cmd2 ...]
     or: -c --help-commands
     or: -c cmd --help
  
  error: invalid command 'bdist_wheel'
  
  ----------------------------------------
  Failed building wheel for wrapt
  Running setup.py clean for wrapt
  Running setup.py bdist_wheel for absl-py ... error
  Complete output from command /srv/paws/bin/python3.6 -u -c "import setuptools, tokenize;__file__='/tmp/pip-build-ayy99os1/absl-py/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/tmpkl3edayopip-wheel- --python-tag cp36:
  /usr/lib/python3.6/distutils/dist.py:261: UserWarning: Unknown distribution option: 'long_description_content_type'
    warnings.warn(msg)
  usage: -c [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
     or: -c --help [cmd1 cmd2 ...]
     or: -c --help-commands
     or: -c cmd --help
  
  error: invalid command 'bdist_wheel'
  
  ----------------------------------------
  Failed building wheel for absl-py
  Running setup.py clean for absl-py
Failed to build termcolor wrapt absl-py
Installing collected packages: astor, termcolor, google-pasta, grpcio, wheel, tensorflow-estimator, setuptools, protobuf, absl-py, markdown, tensorboard, wrapt, gast, tensorflow
  Running setup.py install for termcolor ... done
  Found existing installation: setuptools 39.0.1
    Uninstalling setuptools-39.0.1:
      Successfully uninstalled setuptools-39.0.1
  Running setup.py install for absl-py ... done
  Running setup.py install for wrapt ... done
Successfully installed absl-py-0.9.0 astor-0.8.1 gast-0.3.3 google-pasta-0.1.8 grpcio-1.27.2 markdown-3.2.1 protobuf-3.11.3 setuptools-45.2.0 tensorboard-1.14.0 tensorflow-1.14.0 tensorflow-estimator-1.14.0 termcolor-1.1.0 wheel-0.34.2 wrapt-1.12.0
Requirement already satisfied: keras in /srv/paws/lib/python3.6/site-packages
Requirement already satisfied: numpy>=1.9.1 in /srv/paws/lib/python3.6/site-packages (from keras)
Requirement already satisfied: keras-preprocessing>=1.0.5 in /srv/paws/lib/python3.6/site-packages (from keras)
Requirement already satisfied: six>=1.9.0 in /srv/paws/lib/python3.6/site-packages (from keras)
Requirement already satisfied: pyyaml in /srv/paws/lib/python3.6/site-packages (from keras)
Requirement already satisfied: h5py in /srv/paws/lib/python3.6/site-packages (from keras)
Requirement already satisfied: keras-applications>=1.0.6 in /srv/paws/lib/python3.6/site-packages (from keras)
Requirement already satisfied: scipy>=0.14 in /srv/paws/lib/python3.6/site-packages (from keras)
In [11]:
!pip install request
Collecting request
  Downloading https://files.pythonhosted.org/packages/f1/27/7cbde262d854aedf217061a97020d66a63163c5c04e0ec02ff98c5d8f44e/request-2019.4.13.tar.gz
Collecting get (from request)
  Downloading https://files.pythonhosted.org/packages/3f/ef/bb46f77f7220ac1b7edba0c76d810c89fddb24ddd8c08f337b9b4a618db7/get-2019.4.13.tar.gz
Collecting post (from request)
  Downloading https://files.pythonhosted.org/packages/0f/05/bd79da5849ea6a92485ed7029ef97b1b75e55c26bc0ed3a7ec769af666f3/post-2019.4.13.tar.gz
Requirement already satisfied: setuptools in /srv/paws/lib/python3.6/site-packages (from request)
Collecting query_string (from get->request)
  Downloading https://files.pythonhosted.org/packages/12/3c/412a45daf5bea9b1d06d7de41787ec4168001dfa418db7ec8723356b119f/query-string-2019.4.13.tar.gz
Collecting public (from query_string->get->request)
  Downloading https://files.pythonhosted.org/packages/54/4d/b40004cc6c07665e48af22cfe1e631f219bf4282e15fa76a5b6364f6885c/public-2019.4.13.tar.gz
Building wheels for collected packages: request, get, post, query-string, public
  Running setup.py bdist_wheel for request ... done
  Stored in directory: /home/paws/.cache/pip/wheels/30/84/5f/484cfba678967ef58c16fce6890925d5c7172622f20111fbfd
  Running setup.py bdist_wheel for get ... done
  Stored in directory: /home/paws/.cache/pip/wheels/c1/e3/c1/d02c8c58538853e4c9b78cadb74f6d5c5c370b48a69a7271aa
  Running setup.py bdist_wheel for post ... done
  Stored in directory: /home/paws/.cache/pip/wheels/c3/c3/24/b5c132b537ab380c02d69e6bd4dec1f5db56b5fe19030473d7
  Running setup.py bdist_wheel for query-string ... done
  Stored in directory: /home/paws/.cache/pip/wheels/d6/a4/78/01b20a9dc224dcc009fab669f7f27b943b8889c5150bd68d8a
  Running setup.py bdist_wheel for public ... done
  Stored in directory: /home/paws/.cache/pip/wheels/23/7c/6e/f5b4e09d6596c8b8802b347e48f149031e2363368048f1347a
Successfully built request get post query-string public
Installing collected packages: public, query-string, get, post, request
Successfully installed get-2019.4.13 post-2019.4.13 public-2019.4.13 query-string-2019.4.13 request-2019.4.13
In [14]:
import requests
from bs4 import BeautifulSoup
import os
import time
from keras.utils import get_file
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax

import subprocess
import re
import mwparserfromhell
import json
In [15]:
index = requests.get('https://dumps.wikimedia.org/enwiki/').text
In [16]:
soup_index = BeautifulSoup(index, 'html.parser')
In [17]:
dumps = [a['href'] for a in soup_index.find_all('a') 
             if a.has_attr('href') and a.text[:-1].isdigit()]
dumps
Out[17]:
['20191120/',
 '20191201/',
 '20191220/',
 '20200101/',
 '20200120/',
 '20200201/',
 '20200220/']
In [18]:
for dump_url in sorted(dumps, reverse=True):
    print(dump_url)
    dump_html = index = requests.get('https://dumps.wikimedia.org/enwiki/' + dump_url).text
    soup_dump = BeautifulSoup(dump_html, 'html.parser')
    pages_xml = [a['href'] for a in soup_dump.find_all('a') 
                 if a.has_attr('href') and a['href'].endswith('-pages-articles.xml.bz2')]
    if pages_xml:
        break
    time.sleep(0.8)
20200220/
In [19]:
wikipedia_dump = pages_xml[0].rsplit('/')[-1]
url = url = 'https://dumps.wikimedia.org/' + pages_xml[0] 
path = get_file(wikipedia_dump, url)
path
Downloading data from https://dumps.wikimedia.org//enwiki/20200220/enwiki-20200220-pages-articles.xml.bz2
16939614208/16939609751 [==============================] - 7792s 0us/step
Out[19]:
'/home/paws/.keras/datasets/enwiki-20200220-pages-articles.xml.bz2'
In [20]:
def process_article(title, text):
    rotten = [(re.findall('\d\d?\d?%', p), re.findall('\d\.\d\/\d+|$', p), p.lower().find('rotten tomatoes')) for p in text.split('\n\n')]
    rating = next(((perc[0], rating[0]) for perc, rating, idx in rotten if len(perc) == 1 and idx > -1), (None, None))
    wikicode = mwparserfromhell.parse(text)
    film = next((template for template in wikicode.filter_templates() 
                 if template.name.strip().lower() == 'infobox film'), None)
    if film:
        properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                      for param in film.params
                      if param.value.strip_code().strip()
                     }
        links = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
        return (title, properties, links) + rating
In [21]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._movies = []
        self._curent_tag = None

    def characters(self, content):
        if self._curent_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        if name in ('title', 'text'):
            self._curent_tag = name
            self._buffer = []

    def endElement(self, name):
        if name == self._curent_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            movie = process_article(**self._values)
            if movie:
                self._movies.append(movie)
In [ ]:
parser = xml.sax.make_parser()
handler = WikiXmlHandler()
parser.setContentHandler(handler)
every=1000000
idx=0
last_idx=0
for line in subprocess.Popen(['bzcat'], stdin=open(path), stdout=subprocess.PIPE).stdout:
    try:
        parser.feed(line)
        idx+=1
        if idx % every == 0:
            print(str(idx) +"  - Lines from bzcat")
            if( last_idx != len(handler._movies)):
                actual_idx = len(handler._movies) - 1
                print(f"Actual MOVIES ID: {actual_idx}, with: {idx} Lines...")
                last_idx += 1
                with open('wp_movies.ndjson', 'at') as fout:
                    fout.write(json.dumps(handler._movies[actual_idx]) + '\n')
    except StopIteration:
        break
1000000  - Lines from bzcat
Actual MOVIES ID: 16, with: 1000000 Lines...
2000000  - Lines from bzcat
Actual MOVIES ID: 46, with: 2000000 Lines...
3000000  - Lines from bzcat
Actual MOVIES ID: 52, with: 3000000 Lines...
4000000  - Lines from bzcat
Actual MOVIES ID: 67, with: 4000000 Lines...
5000000  - Lines from bzcat
Actual MOVIES ID: 105, with: 5000000 Lines...
6000000  - Lines from bzcat
Actual MOVIES ID: 119, with: 6000000 Lines...
7000000  - Lines from bzcat
Actual MOVIES ID: 168, with: 7000000 Lines...
8000000  - Lines from bzcat
Actual MOVIES ID: 199, with: 8000000 Lines...
9000000  - Lines from bzcat
Actual MOVIES ID: 289, with: 9000000 Lines...
10000000  - Lines from bzcat
Actual MOVIES ID: 469, with: 10000000 Lines...
11000000  - Lines from bzcat
Actual MOVIES ID: 583, with: 11000000 Lines...
12000000  - Lines from bzcat
Actual MOVIES ID: 618, with: 12000000 Lines...
13000000  - Lines from bzcat
Actual MOVIES ID: 642, with: 13000000 Lines...
14000000  - Lines from bzcat
Actual MOVIES ID: 702, with: 14000000 Lines...
15000000  - Lines from bzcat
Actual MOVIES ID: 702, with: 15000000 Lines...
16000000  - Lines from bzcat
Actual MOVIES ID: 718, with: 16000000 Lines...
17000000  - Lines from bzcat
Actual MOVIES ID: 720, with: 17000000 Lines...
18000000  - Lines from bzcat
Actual MOVIES ID: 737, with: 18000000 Lines...
19000000  - Lines from bzcat
Actual MOVIES ID: 746, with: 19000000 Lines...
20000000  - Lines from bzcat
Actual MOVIES ID: 803, with: 20000000 Lines...
21000000  - Lines from bzcat
Actual MOVIES ID: 846, with: 21000000 Lines...
22000000  - Lines from bzcat
Actual MOVIES ID: 928, with: 22000000 Lines...
23000000  - Lines from bzcat
Actual MOVIES ID: 1026, with: 23000000 Lines...
24000000  - Lines from bzcat
Actual MOVIES ID: 1106, with: 24000000 Lines...
25000000  - Lines from bzcat
Actual MOVIES ID: 1157, with: 25000000 Lines...
26000000  - Lines from bzcat
Actual MOVIES ID: 1201, with: 26000000 Lines...
27000000  - Lines from bzcat
Actual MOVIES ID: 1268, with: 27000000 Lines...
28000000  - Lines from bzcat
Actual MOVIES ID: 1337, with: 28000000 Lines...
29000000  - Lines from bzcat
Actual MOVIES ID: 1401, with: 29000000 Lines...
30000000  - Lines from bzcat
Actual MOVIES ID: 1436, with: 30000000 Lines...
31000000  - Lines from bzcat
Actual MOVIES ID: 1484, with: 31000000 Lines...
32000000  - Lines from bzcat
Actual MOVIES ID: 1531, with: 32000000 Lines...
33000000  - Lines from bzcat
Actual MOVIES ID: 1583, with: 33000000 Lines...
34000000  - Lines from bzcat
Actual MOVIES ID: 1610, with: 34000000 Lines...
35000000  - Lines from bzcat
Actual MOVIES ID: 1643, with: 35000000 Lines...
36000000  - Lines from bzcat
Actual MOVIES ID: 1697, with: 36000000 Lines...
37000000  - Lines from bzcat
Actual MOVIES ID: 1733, with: 37000000 Lines...
38000000  - Lines from bzcat
Actual MOVIES ID: 1762, with: 38000000 Lines...
40000000  - Lines from bzcat
Actual MOVIES ID: 1841, with: 40000000 Lines...
41000000  - Lines from bzcat
Actual MOVIES ID: 1884, with: 41000000 Lines...
42000000  - Lines from bzcat
Actual MOVIES ID: 1963, with: 42000000 Lines...
43000000  - Lines from bzcat
Actual MOVIES ID: 2019, with: 43000000 Lines...
44000000  - Lines from bzcat
Actual MOVIES ID: 2071, with: 44000000 Lines...
45000000  - Lines from bzcat
Actual MOVIES ID: 2111, with: 45000000 Lines...
46000000  - Lines from bzcat
Actual MOVIES ID: 2142, with: 46000000 Lines...
47000000  - Lines from bzcat
Actual MOVIES ID: 2194, with: 47000000 Lines...
49000000  - Lines from bzcat
Actual MOVIES ID: 2308, with: 49000000 Lines...
50000000  - Lines from bzcat
Actual MOVIES ID: 2369, with: 50000000 Lines...
51000000  - Lines from bzcat
Actual MOVIES ID: 2419, with: 51000000 Lines...
52000000  - Lines from bzcat
Actual MOVIES ID: 2458, with: 52000000 Lines...
53000000  - Lines from bzcat
Actual MOVIES ID: 2510, with: 53000000 Lines...
54000000  - Lines from bzcat
Actual MOVIES ID: 2573, with: 54000000 Lines...
55000000  - Lines from bzcat
Actual MOVIES ID: 2675, with: 55000000 Lines...
56000000  - Lines from bzcat
Actual MOVIES ID: 2735, with: 56000000 Lines...
57000000  - Lines from bzcat
Actual MOVIES ID: 2784, with: 57000000 Lines...
58000000  - Lines from bzcat
Actual MOVIES ID: 2830, with: 58000000 Lines...
59000000  - Lines from bzcat
Actual MOVIES ID: 2882, with: 59000000 Lines...
60000000  - Lines from bzcat
Actual MOVIES ID: 2939, with: 60000000 Lines...
61000000  - Lines from bzcat
Actual MOVIES ID: 2981, with: 61000000 Lines...
62000000  - Lines from bzcat
Actual MOVIES ID: 3043, with: 62000000 Lines...
63000000  - Lines from bzcat
Actual MOVIES ID: 3099, with: 63000000 Lines...
64000000  - Lines from bzcat
Actual MOVIES ID: 3164, with: 64000000 Lines...
65000000  - Lines from bzcat
Actual MOVIES ID: 3233, with: 65000000 Lines...
68000000  - Lines from bzcat
Actual MOVIES ID: 3457, with: 68000000 Lines...
69000000  - Lines from bzcat
Actual MOVIES ID: 3507, with: 69000000 Lines...
70000000  - Lines from bzcat
Actual MOVIES ID: 3568, with: 70000000 Lines...
71000000  - Lines from bzcat
Actual MOVIES ID: 3655, with: 71000000 Lines...
72000000  - Lines from bzcat
Actual MOVIES ID: 3720, with: 72000000 Lines...
73000000  - Lines from bzcat
Actual MOVIES ID: 3776, with: 73000000 Lines...
74000000  - Lines from bzcat
Actual MOVIES ID: 3822, with: 74000000 Lines...
75000000  - Lines from bzcat
Actual MOVIES ID: 3875, with: 75000000 Lines...
76000000  - Lines from bzcat
Actual MOVIES ID: 3924, with: 76000000 Lines...
77000000  - Lines from bzcat
Actual MOVIES ID: 3996, with: 77000000 Lines...
78000000  - Lines from bzcat
Actual MOVIES ID: 4087, with: 78000000 Lines...
79000000  - Lines from bzcat
Actual MOVIES ID: 4146, with: 79000000 Lines...
80000000  - Lines from bzcat
Actual MOVIES ID: 4217, with: 80000000 Lines...
81000000  - Lines from bzcat
Actual MOVIES ID: 4293, with: 81000000 Lines...
82000000  - Lines from bzcat
Actual MOVIES ID: 4345, with: 82000000 Lines...
83000000  - Lines from bzcat
Actual MOVIES ID: 4415, with: 83000000 Lines...
84000000  - Lines from bzcat
Actual MOVIES ID: 4489, with: 84000000 Lines...
85000000  - Lines from bzcat
Actual MOVIES ID: 4628, with: 85000000 Lines...
86000000  - Lines from bzcat
Actual MOVIES ID: 4727, with: 86000000 Lines...
87000000  - Lines from bzcat
Actual MOVIES ID: 4791, with: 87000000 Lines...
88000000  - Lines from bzcat
Actual MOVIES ID: 4843, with: 88000000 Lines...
89000000  - Lines from bzcat
Actual MOVIES ID: 4917, with: 89000000 Lines...
90000000  - Lines from bzcat
Actual MOVIES ID: 4978, with: 90000000 Lines...
91000000  - Lines from bzcat
Actual MOVIES ID: 5035, with: 91000000 Lines...
92000000  - Lines from bzcat
Actual MOVIES ID: 5105, with: 92000000 Lines...
93000000  - Lines from bzcat
Actual MOVIES ID: 5217, with: 93000000 Lines...
94000000  - Lines from bzcat
Actual MOVIES ID: 5306, with: 94000000 Lines...
In [ ]:
with open('wp_movies_full.ndjson', 'wt') as fout:
    for movie in handler._movies:
         fout.write(json.dumps(movie) + '\n')
In [ ]: