Archiving Discourse

This Jupyter notebook contains the Python code I use to auto-archive my Discourse instances using the API. You can read more about this in my question on DiscourseMeta.

If you're reading the HTML version off of my webpage, the actual notebook file should be here. There's also a version on GitHub.

As an example, let's try to archive a bit of https://meta.discourse.org. Note that this site is really too big to archive the whole thing - at least for me. You can adjust how much you want to download by fiddling with the max_more_topics parameter in code block 6 below. For the purposes of this demo, it's the parameter is set to 5. You can you can examine the archived version here.

In [1]:
# Archive meta.discourse.org

# Be sure to define the base_url of the Discourse instance, 
# the path of the directory to save stuff on the local machine, 
# and a blurb to describe the site.

# Note that the directory specified by `path` will be overwritten.

import os
from datetime import date
base_url = 'https://meta.discourse.org'
path = os.path.join(os.getcwd(), 'meta_discourse')
archive_blurb = "A partial archive of meta.discourse.org as of " + \
    date.today().strftime("%A %B %d, %Y") + '.'

From here, the code should just work, assuming the following libraries are all installed. You might need to install requests, PIL, and BeautifulSoup; they don't come with every Python distribution.

In [2]:
import requests, base64, sys
from urllib.parse import urlparse
from bs4 import BeautifulSoup as bs
from PIL import Image
from io import BytesIO
import requests, base64
from urllib.parse import urlparse
from time import sleep

from shutil import rmtree
In [3]:
# When archiving larger sites (like meta.discourse.org), you might need to 
# increase the number of retries to connect.
# Doesn't seem to be necessary for my site but it *is* necessary for Meta.

from requests.adapters import HTTPAdapter

s = requests.Session()
s.mount(base_url, HTTPAdapter(max_retries=5))
In [4]:
# Templates for the webpages
base_scheme = urlparse(base_url).scheme

# Template for the main page. Subsequent code will replace a few items indicated by
# <!-- COMMENTS -->
main_template = """<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <!-- TITLE -->
    <meta name="viewport" content="width=device-width">
    <link rel="stylesheet" href="https://use.fontawesome.com/2374bdec1c.css">
    <link rel="stylesheet" href="./archived.css" />
  </head>
  
  <body>
    <header class="header">
      <div class="title-span">
        <h1 class="site-title">
          <img src="images/site-logo.png" height="66" alt="<!-- JUST_SITE_TITLE -->" id="site-logo">
        </h1>
      </div>
    </header>

    <div class="main">
      <div class="archive-span"><!-- ARCHIVE_BLURB --></div>
      <div class="topics">
        <div class="header-row">
          <span class="topic-head">Topics</span>
          <span class="category-head">Category</span>
          <span class="post-count-head">Posts</span>
        </div>
        <!-- TOPIC_LIST -->
      </div>
    </div>
  </body>
</html>
"""


# Template for the individual topic pages
topic_template = """<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width">
    <title><!-- TOPIC_TITLE --></title>
    <link rel="stylesheet" href="../../../archived.css" />
    <script type="text/x-mathjax-config">
      MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\\\(','\\\\)']]}});
    </script>
    <script type="text/javascript" async
      src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML">
    </script>
  </head>
  
  <body>
    <header class="header">
      <div class="title-span">
        <a href="../../../">
          <img src="../../../images/site-logo.png" height="66" alt="<!-- JUST_SITE_TITLE -->" id="site-logo" />
        </a>
      </div>
    </header>

    <div class="main">
    <div class="archive-span"><!-- ARCHIVE_BLURB --></div>
    <h1 class="topic-title"><!-- TOPIC_TITLE --></h1>
      <!-- POST_LIST -->
    </div>
  </body>
</html>
"""



# Function that writes out each individual topic page
def write_topic(topic_json):
    topic_download_url = base_url + '/t/' + topic_json['slug']+'/'+str(topic_json['id'])
    topic_relative_url = 't/' + topic_json['slug']+'/'+str(topic_json['id'])
    try:
        os.makedirs(topic_relative_url)
    except Exception as err:
        print('in write_topic error:', 'make directory')
    response = requests.get(topic_download_url + '.json')
    posts_json = response.json()['post_stream']['posts']
    post_list_string = ""
    for post_json in posts_json:
        post_list_string = post_list_string + post_row(post_json)
    topic_file_string = topic_template \
        .replace("<!-- TOPIC_TITLE -->", topic_json['fancy_title']) \
        .replace("<!-- JUST_SITE_TITLE -->", str(site_title.text)) \
        .replace("<!-- ARCHIVE_BLURB -->", archive_blurb) \
        .replace("<!-- POST_LIST -->", post_list_string)

    f = open(topic_relative_url + '/index.html', 'w')
    f.write(topic_file_string)
    f.close()

# Function that creates the text describing the individual posts in a topic
def post_row(post_json):
    avatar_url = post_json['avatar_template']
    parsed_url = urlparse(avatar_url)
    path = parsed_url.path
    avatar_file_name = path.split('/')[-1]
    if(parsed_url.netloc and parsed_url.scheme):
        pass
    elif(parsed_url.netloc):
        avatar_url = base_scheme + ':' + avatar_url
    else:
        avatar_url = base_url + avatar_url
#    if(not parsed_url.scheme):
#        if avatar_url[0] == '/':
#            avatar_url = base_url + avatar_url
#        else:
#            avatar_url = base_scheme + '://' + avatar_url
    avatar_url = avatar_url.replace('{size}', '45')
    if not os.path.exists(os.getcwd() + '/images/' + avatar_file_name):
        try:
            response = requests.get(avatar_url, stream=True)
            img = Image.open(BytesIO(response.content))
            img.save(os.getcwd() + '/images/' + avatar_file_name)
        except Exception as err:
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            message = template.format(type(err).__name__, err.args)
            print('in post_row error:', 'write avatar', avatar_url, message, cnt, topic['slug'], "\n===========\n")
            #sys.exit(0)

    user_name = post_json['username']
    content = post_json['cooked']
    
    # Since we don't generate user information, 
    # replace any anchors of class mention with a span
    soup = bs(content, "html.parser")
    mention_tags = soup.findAll('a', {'class':'mention'})
    for tag in mention_tags:
        try:
            rep = bs('<span class="mention"></span>', "html.parser").find('span')
            rep.string = tag.string
            tag.replaceWith(rep)
        except TypeError:
            pass

    img_tags = soup.findAll('img')
    for img_tag in img_tags:
        img_url = img_tag['src']
        parsed_url = urlparse(img_url)
        path = parsed_url.path
        file_name = path.split('/')[-1]
        if(parsed_url.netloc and parsed_url.scheme):
            pass
        elif(parsed_url.netloc):
            img_url = base_scheme + ':' + img_url
        else:
            img_url = base_url + img_url
        #response = requests.get('http:' + img_url, stream=True)
        try:
            response = requests.get(img_url, stream=True)
            img = Image.open(BytesIO(response.content))
            img.save(os.getcwd() + '/images/' + file_name)
            img_tag['src'] = '../../../images/' + file_name
            # print('good', file_name, img_url)
        except Exception as err:
            template = "An exception of type {0} occured. Arguments:\n{1!r}"
            message = template.format(type(err).__name__, err.args)
            print('post_row', 'save image', file_name, img_url, message)
            img_tag['src'] = '../../../images/missing_image.png'
            #sys.exit(0)

    content = ''
    for s in soup.contents:
        content = content + str(s)
    
    post_string = '      <div class="post_container">\n'
    post_string = post_string + '        <div class="avatar_container">\n'
    post_string = post_string + '          <img src="../../../images/' + \
        avatar_file_name + '" class="avatar" />\n'
    post_string = post_string + '        </div>\n'
    post_string = post_string + '        <div class="post">\n'
    post_string = post_string + '          <div class="user_name">' + user_name + '</div>\n'
    post_string = post_string + '          <div class="post_content">\n'
    post_string = post_string + content + '\n'
    post_string = post_string + '          </div>\n'
    post_string = post_string + '        </div>\n'
    post_string = post_string + '      </div>\n\n'
    return post_string


# The topic_row function generates the HTML for each topic on the main page
category_url = base_url + '/categories.json'
response = requests.get(category_url)
category_json = response.json()['category_list']['categories']
category_id_to_name = dict([(cat['id'],cat['name']) for cat in category_json])

def topic_row(topic_json):
    topic_html = '      <div class="topic-row">\n'
    topic_url = 't/' + topic_json['slug']+'/'+str(topic_json['id'])
    topic_title_text = topic_json['fancy_title']
    topic_post_count = topic_json['posts_count']
    topic_pinned = topic_json['pinned_globally']
    try:
        topic_category = category_id_to_name[topic_json['category_id']]
    except KeyError:
        topic_category = ''
    
    topic_html = topic_html + '        <span class="topic">'
    if topic_pinned:
        topic_html = topic_html + '<i class="fa fa-thumb-tack"'
        topic_html = topic_html + ' title="This was a pinned topic so it '
        topic_html = topic_html + 'appears near the top of the page."></i>'
    topic_html = topic_html + '<a href="' + topic_url + '">'
    topic_html = topic_html + topic_title_text + '</a></span>\n'
    topic_html = topic_html + '        <span class="category">'
    topic_html = topic_html + topic_category + '</span>\n'
    topic_html = topic_html + '        <span class="post-count">'
    topic_html = topic_html + str(topic_post_count) + '</span>\n'
    topic_html = topic_html + '      </div>\n\n'
    return topic_html
In [5]:
# The action is just starting here.

# Check for the directory where plan to store things.
# Note that this will be overwritten!
if os.path.exists(path) and os.path.isdir(path):
    rmtree(path)
os.mkdir(path)
os.chdir(path)
os.mkdir('images')

# Grab the site title and logo - available via the API but only after login
# so we'll grab this one thing via Beautiful Soup.
response = requests.get(base_url)
soup = bs(response.content, "html.parser")
site_title = soup.title
site_logo = soup.find("img", {"id":"site-logo"})
if site_logo == None:
    default_discourse_logo = b'iVBORw0KGgoAAAANSUhEUgAAArIAAAC4CAMAAAAoo//9AAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAADeUExURUxpcSMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfICMfIPDqieYaJACpUACu8PJcIQCoSgCz+vFSHiQYFvDxjvzujPAbJPJhIxgfIB4eIAGq3uw7IukrI6kcIwx7pAKreiKyV4MdIkAeIACrmV0eIN3mhQCtvFW/ZPHZfRw9SvJ7OMsbIxFniMDeft0aJBZRZgePw/GuXfKLRKXXd/HFbn/Mbh4sNO6dUSweHxcaH71YKW9IJ3fuwocAAAAYdFJOUwA4KcX+pOII+hIDHHvs9WNV2rRti89HlfziqaEAAB5LSURBVHja7J0Jd5roGsfLKvsOYoyJ08QsOq2ZdOo1vd7GZiaZ6ff/QhcUFQTeBYH06PM/p7Mlh4GXnw//Z5H3w4capXRkxnI9weFs0QgTSaLNOYLnWozcUT6AQL+KOmpgeo4t6lJYKEkXfcczA7UDawV6f1xZK6JVD/HiddvRAhawBb2fFDXQHFEKKWTYjsvIsHSg95BqdX0jrCCd0xiItaCWJQcVeU2odVwW8jFQe4aA1bgDeF07W1EIwCCAWgqwgsiHNcjgXBWWE9Q4sJajh3VJ8jUWlhTUaMplHuwI9vyB7QG0oOYirMlJYe2yNbAHoEbUsRwjbEK8b0IiBqq/SsAIetiUJCeAQi2oZhPriWGT0rtgaUG1egIubFq+CYEWVJdYwQiblyFAoAXVFGL9sB3ZFgRaUB0u1gjbkuFBvQt0qBiOD9sT7zCw5KCDTIEphu3KtmDCC1RdcoumYFvu0qCvAKpcKXD4sH1JAhhaUFUbG76LeAeqXaAqsuzwvcRBEgailtJ64pVphQVwB0CUpQJXD99TNjALoiNWM8IQmAWdALG3GwGzoF+c2Nvbm5ubiFSef3t74/ntvwOzoBYyL5eS2IjOW/7hcTqdTJ6enhaLRfTXyWQ6fXxY/ahyDgZ1AxCZTKrMKwqmb4/TydPi7Hqts7PtPy2eJtPHt8rUclCfBZHIEmni6+3DdLJISM1rxe3k8aEitQ70wUB4MTZFgI14PSvDNYVtRG0UayswK8C8AQgnliMPsI9xfD0j0fX10/ShArSSB0PfILRkhwLYsyJeF2XQnk0eb6mhNUy4JyBkecsjm92Kgc0H2JjW8fj+fpjS/f14vKH2bEIfaUUodYGQxQKDLMQ+5IBd4RrBOhhFGowyf4vAHe+gvYWyAai+1IuoWHDzNs1bggjXmM9BoVbcrrC9XkxDykArgJ0FlUnliELs49N1Ia8DpDbUXtMGWsMlvgAF9r0BI0sSYsd4XnfUjuNAS5eG2URdMEW1tK4gaBa8Hvx0ZBEY2ZuH/RB7PyzmNZ2DDVPUDu+v40BL1VEgqM6qWvIifAle/3kyYn0SU7C4xgMbQzoazefL5fI1+rOcz0eDLbgraJ8eacyBpGFPPkh/dx2+pXsitqBLMK2VMQWLyBLkgI1pnb8+v8xmvY8b9Xqz2cvz63y0xjaCdrygYlbEWYO9JrPuArNgC2Jiw+l11sPuAxvjGtG6AXWn9X+JuI2xXUM7pZlMxFQNgv1Kh27BDYVqQUTsJEPs/WC0z+tyjWuvRGtslzG1o8E9DbPoJljBudvgZ49eGo8tFWSJHe4BO5g/zxC4prCdPc+jXx8Np/+QM+ujZrrcgnPvgjU4cmGbCHsxdpwNsXGA7eF53VDbe1lG0A4mPHnhwCs/d7noASFCmD3y3EvAZl4ZYrOmYDh6nZHyuqF29joaDv59+KeGDIwpGknnYZ7muBXo2FpBhtgMsIMlJbAJtMsBDbPlT3qzOCyDMzjqIIsbObzJEJuxscP5Cz2wa2hfIk/7SMqsXjrSpRX3H2A04aiDLKbAdfOYrW2lPcFzrxKwK2h7z6PhT9IkrLTQ5QGyEGSRXdoMsfNZZWBX0M7mEbMHhlkXkIUgi0i90jF2ODggxG4C7Ssxs2Vh1uLpvC/o+IPstDjGDkcvBwIbM/vl67+EflZnaAp0LtzYky0X3DykRmHSxB5oChJd/fhMymxJ4Cz8yInw3o7jlSKQ24JUdWu47NVBbMTs14ufD0QpWFkX1pTguwwnJVbEVAuKiX39WA+xvasvn/s/34he710yhFjwtWAIsscs9HTB7duuWjAepInt1aWrH/3+Bf8byVcXS2a9c68LMcDJHrFkjjj32hnZOomN9Pmi/00iYNawyux4lllDA1twxELPyd6mcq/GiI3C7MWn30nibKlDzeyoJ7pA7OkmX6kgO26K2FWYvfjjdwJky+ezVM1fJ2G8KICPPe7ky8YE2a0tqL9WkAmz/T+/4ZnlER5VtTxBELomCyH2yH2BRBhkt9WC4bxuYtdh9tP5//DMQhsWfIFAWC4Y73pes/qJvfrron/xx/l/sMzqMLl96lJtwprsLvd6qZ/YVW32on/+HZuC8VC9Al+AjLLbxtc296o/9dq0wKIwe4m3s+AMTt0XeOjpglxJthEjGyP7334UZu8usdYAvjp74kL3EXbJ17hRW7BOwPpxmP2O6yhIFty1kxajk/mCbZBdNkXsKgGLwizWGnhw105aJk9UlB03WS3YDRrEYRZbNeBgx4+TtrJdsnrBpiY7fG6K2MQZfDq/xDXB4AUFYGXLkd36gkGzude2ZtC/+BMbZnkws6cs5KjsbbjpI4zbCLKxmY2dweV3MLOg8qqsQWRlk+Sr0SC7KnPF7YRzXKELKrOnLJfMyg5aCLK93pfP/ZUzwLlZH3YJPV2h34K8rcomvqDBcsE6zH5OagYYN6vDbOEJZ18OUfaV1AsaatXu5V+fznFFg4L8S3X3hSwrdFQ2sCzXsgJWrV4y68gqEx3GjI6zOtABfkWRV2dkmusj/TpvYJCj0zKjc5LLzptl1qfNsBWvP7l0i3AVMTMxm+xrsznHrGFk4/yr3787v7z7G8ls/kuLjMHvqfy9h3KgObZuGIYU/dFtx7PoB2wVmTE9wRdXh+Hj4+g2F2+GU+W2yYzbTc4oOaWuyxBhq+RE8bvlP9x+JgOP01frJHIak7u0DuMKXLwE8WnrIidEZ029jKtLj699vYoibhWRL5W95RNkx9sKV7O6+nGxNrPnl+jBWSF3Z5jccE8Jsgq72bgm9UUxW7Bo1lpRrehWFo0TGaJTcGvRoZrROD3Xz+F1zguw59RxvT2Vf4GI2f9VLZURBNkfrV8kpTBCujWqa9lVly1H5PfP2nEpdrCS408ET7mKlo6clV1krWyzydcW2cjMXn7/ja5kQIos6xV/SA3OlIkZ83xUnUV3TPL0MLrvpbfA4DTM/c9X1cs7gy6qI6MVPMVkT0dMfXYsrnAEkLc9hgxa1eUQyyg6JVHElUhqXImVbTj5SqpcKzN7jnYGvlwN2Y5Vvk2UxFmdAxnbHsrXyBp0sskhRz95zA5mVMhKKGT5HLKswyMm6FihHDdRI/jIqpqN2eigJIp4RC/oXFdlh/OPvVaQ7UfIop1Bfv6QCFnZQ36X2OhiQesEDtG21KFNcNuUwJEOO1CDyLIcyo0FyB3ieC7ABNqO6RPs41kYRQQyZNvxBb2rL/3+Ov/C1AzyVS4SZFUBt0xcgOkVCnpIKB4btFVPP/RAzSGrOog3SCgu7sx1TUavI9kHvyiKkCHb9KDsHrKr/OsOhZcRVEBWdfBLJJoKKtmxqTaL9pCBNuCID9RV20ZWyW9mvOvf4ImNAqRQfvGK5ZOvor8fsDmS5tc46SP0GtcOWXQ3IV+YxSOL3b8keS+SUh4apJBOHIPAX6zhQI0hW5CWe2Q5+y5FLmO2o+k0i6jvFUFokF02HWS3UTbuf6HNrEmNrKKRAVfKLMOF1CrdShfjqkmjf0PI8h6HeIM6Q/isKWFW9ig/+YaXuSabxBisa1xNt772kUWaWY0aWYY0qhXvJKpYdlhBulnoQ2XqgK0XFlybQtaWyufqZYHwlHlBLrx0nnYRpW76QCIJsusa1/ClNWQ/xch+rxXZjkMeGguewoqph5VUGLQr3Lbi9+I1ZQxQK24Rf9p4Tanl0rPMUiDbeFU2hWxc5br7m2ZiFodsQP4g5ljKlgsmztZz2wrhbw/ZbY1G5g454w6tK0iY9TpkyG5aCQmyzWdf67rsGllk/sXnOrYYZDsCObEFQTYQw8rKbaNT8bYVOZb2kN32Gy2DnNjcc0FxjYMfVuTINt9IiJG92CGLzL9yr+zEIMuKhxDL2OEB8veidtXbVuBYWkN2O2SkCIc8FVB7cqxmmcrzz4AM2eR9XO0h+6MpZM3ifjhPRKzshAfJkWuK2JzaOrIix8UDFdt2o+oTLmQRsWxJPZaPx7dMy7Li4Tgdc+2YTRLSyC5bQPavNLLfakTWK+hge2ZgaY6IJbagrL5bbN32uVi+XRpAJCcdZtVyLyhFB/N9US//33mddpEVNUaW1cCztzsBFbz2whZcK3C72VGhImJL3JkopN6uqqiMVjx3sdmRGIPsJIXsawvIfm0I2Xy9QExGLjqsZmNK9qVPM8PvmgwrxwOmHZkNTKFowIvPDneU8i86WsCwqqqyTDzZSOaLG0b2/8ydCXPbxg7HTfEWSVG8JCdyHadKZ2zXZ5xM2niavrxk3vf/SI+ntMvFYrE81HKmM00b09LytyAW+AMouuUIDuZdGAro2a3aLFgz8osICpSAbrCTCzJDPzYt+XdXIPvIIPv3CZB9YJHFArNbveOX8GjZo8zikOsHiZUdkR1701NhL7mn1j71LKDwn3D60KW/kYgZekTOiywU7RMkjEzw1Y8LjFhoIb0UVCoGUIKs9a9I05NuT4VsXa7YphJwZDXjsoIHxgluu5w3nBaF9ZmRDWqQ3TVnIJy+CBEOD0eixNA1wGyDl50OWTCnEqJ2v81pg8RCCylVzywBqViry1Eg24QMbk+j4+piXKdAtmelF6kkHis7bxRSdRWjz4q2gkIbDA8XoFTPB8Un/AlsVmTBQSp9ZB3+JeJX5hEmFnDiI0SEBOTHGzOLI9uGDG5PlfxqAgatLOZ0VrbRJcLEgg/TShFhbZfaLe2wGEmHjKzsZstNojKzcyILlzGHCu96GUcwsUDQBpEgnUGC3Ua/pwi4tOevU1nZz2RkNWUxqC/b2jRYLAUZWcvG5dtG+TOrFLLDkCebym8GMcsxOSeycH+TWGmL17DpFHertcUFxaIqpE4gqZBtzl9XJ/JlG1e2lnjjyOqKD8XwSrKmFRWCw3FVVWKGCdeAQD37CqwSAsgTc7P6ZkRW0t9f8Gy8La3QTYyOmaofFD5xHR1WIft0KKQ5QU1C58ruLlTIaku8Q/G9t6WUZ0FvclP9gwH8NIAS/Aitg4AUk2yuekZkJe1NxDSiRauZC9G9B8sUUygHp0C27SNXRbnmj8u2iYROYoDFZbULaaBjT0KoKQQi587mbOgliqAgsZMi8cbWvc2IbOGT9/DK3rj6e58wyF3wo6rtqkoeNp7B7WmQfbhkAgYostrlinBwVQ1tZulChtlMMQen7C4m7jW27/6MyMqIyuCQnwpawTpTmlQJnCcBAdmnLso1e8K28wva0xeqMdAvCpdkJBXQAinGES3sRMDUA6EAq8YE6GZEVtYQVTKLUwWt8IJJF+oryAE3SoVs6xlcnUAW0/kF7ekLRVa/9UYgq5BDoRWdTyscTCxwBE7U/ItZTobKGZGV9oeS9RHAoRVc2SghXILcIlMj23oGpTM7f7ViGy9oXVlUL7vVb3AkbTJiIdCKruyYpveiK7tVOxnirmE+wnzIyg9H8pHHCLTuSDEcs2BKZBvNbOXMzt2o8/tux7myWFUC0EZOhSyiIJRDK4a47BHdDQccmiHfhAmXzIcs4moiHYak0PrFNMimvhrZNpswf/qryyMcXFmk9guY/KUuCsd02laR+aSXINJPUX362g4y2eIJMDsBsg5Sz47VAUW2QYuNDbsKErJPbWR23sDs8fDVRGXRsCxgAgitN9D6LS8FEv2iBD8aHuECXo6kaVBixiw/AbKYl433MEkgqYsRTYNs+ZkJ8NfT6krPYN6QwdHINn4BHjAIhiCrqJKNxOYuImTJiJb3rjnIy1hEcg/4n0EWSP/zCgyDEKsbdpUmg4BsY2Zn7hZzMLKdX4CevtLlEGRVzIpLLRJRjHBlRX+ONFkncOSkz4cs/kVxZs8doVCRXkmuQHZNQbYxs1UC7L+/nsDItvECtCcXEGgidT5U9c/oN3cRiRgzC8dPCN+DQvopkDXxL7rA69pXfeHQVMiuYhKytWr2dlaVwSFccPQL7rROX9SWyIouRb2eLP8OZM2ZkE1GIKvs0NRTVE6FrJeRkG1is1e/zenMPnRGts0joK4sdNCmdvEOtniLWS4dKxBh/cuQ1TnRCZEH9kSgjeyZq3hj8ZrKUyNbK73LA9hskdlj4qszshcXP3VyXxqzEtw12ow3ik9oZfNhP8Yga9ORzc8nRVbZJJaTaJ4a2abT0dVsngFz9joYWawjF/SkychWpS7Yt2YDksB7dwyyaF9suqCEiRiMQ9ZHkKXsTdm4BEA/dHJk63zC7Xxhrs+CkcX8Am8zDtn+pBUxwTJTkCsdZLOxuCwgXacjy26/IVa22oVZ4WEBqcmDXBYd2dqbnWnuF+MWdEYWne4BYqODbF2/Ku+UwfygTUhi0JG1B20AMWmcIXJGJ6AjuxyNbDVWRu5nMZtfSCUk9rBro2NlK89gDs3s0S1gjCxW9wW+TvWQRaFl3q3hlAlbQM1E2QA5pkwQ/mckzQHbWHR7MLIYtMwHFbyb1HcHXUsisrUEsUqAzYDs/uVBNLJo10M4eqWJLAIts9LxEO2V3F5a4mtugAfMgq6xC2xs649AFoGWicYVk3lYRGSryOzNLCGD/dGRPSa+3n3D08yTICv3aY8rLabGxzizoleaDigm4Y5NGVkcJnrS+dlEyNbQgt7sQuoUrTbzIlt3jZnHL9gzxHZCWc2mh4ORlU3eOmIpCsO9EZ6BKH1VewZAUS67AIIAXGq4xc4X4XTISiZCMU0XwsleV0RkHz/UxTTTZ2yPRy/GLUCNrISZYci27U2kFgDopmqOGCwu3k354ICK1pgYTlDciCusH41suftN7Ngh7K3BryuiY1Cevm7myH7tf+x2gluAG9lkMSmyoO4gQ5xZLxtuZrNzXTMLUM4pWUWiU8mWEhsQbCZFFpoEaB33t1DLMLgiySGfvq5m0HhzxD5fUIysLPw+GFlIR5tj0uSEUhoKP3BANZr6mp+N93/F1JgsZBCiZ4IpkAV0tEdTCkT4FnMiW56+bmcYbc8Re3Bk8SF1MredhuwC6hKxFJ4XlhFFOxJ1INnwYEWgAspDa8yBttd8wARoGwRbL9Er50THusi64DB0oR6T2T8xuex8CmSr01fpF0ycrt1zxB7jW/jAL5krSUI2SMFee4IpZRYTaONrqRocVRNn4MHDQLukKNYxW33jBJzO4Kp1sRyHs9aayLrhCupLJnya1QJ5X61ULlawcAcj+1j5BRNHuNjoVkXswZFFx9pLD8SkscspPPJNCH0yyIJ9fPE2co1PF0GGFipNlzMLjonOlwr32MqhXZngSg09ZKvBMmD3x0yOLHCSjdb4yTN1tkATX4d6+po6wrV/4Yg9HL20p9rTkW3GAvbGS6qQhQeDpIg/20lywQnvoVqkyzwyhWpH4h4DJADmmu/Ep4VsW90BdJBGkIW6PjprxCuqIxBAyS6pKuH8z7c3ExvZ/fcHhthLhljULTjPl4OR7QbEifUywmPfop3a6q0TSx5pEB6XNAoDipmFJrhX3mJCWQDo4zlrgrnmMzJayHZnwqRflix4/uwvgY4F0i7eZ363jqt07esjW56+riaNcO3f/NhJiP3mvScqAzWRZQbEOTxK4lBW7gQDz2YTVrJhgx9MARhauAlAf6LCmbtJV7S3DNS8hSdhuUktlehYB9lj19t+WXK8wvxqqLmuZFaCy6Z4ViZ31iMV0nysqhWni3Dt96+MieW9AjRagEXelTNs2baXVpEdW8KLZUweaTDjygwNlgw32ORCrl0wI5IOIF6Rb4KOEt/IUlj+4IWUwFn598zDXKJgs43UhwINZLl+EOzICDfDE9LwQgL+qjAlxTOZLU1C9vHD7S9/zGRiOWIVjiwWeFcNBA35RbCS7Xrhu35ghInq10h7dliRaYfrjWEYm3WWpwlkF61+fwRkJFO6DbMsy21TOvkLiOJKWgdZTn23HBzsJCaf6MgaPdcmSjMj8N1gsQbeC7nqDNjcISzv0KyS6y/WW+AjW0W+WGo4Br9/uPllqgjX/g3rxfaJ/YoSi6U3cWSXsbgKnmOmZgHR0Q+kxYg+2fK8VXl5yChL3tBCXY6Z21mWrl8kF/wjN+tHb8nIAuXgVlSkqel4qCYOXUhrlaS2nYdbOzUj2ddxYjKyVeXX1W9vfp3IJ/i844i9v2aIvTtHkXWwczqKrN5o+v7b190Omewty3749uAbgeMwNEZ3y31iKrILrV/WDxG72xHL2D19CrJPb2+niXCVFrYH7DFLWx+9LNzI5suByK61OkKJ7kcwpm+ftfXHPHY2hQFjtNYvUhGEEkRkNRcin3AhVzHdly1PXzcTRLj2+5fvDxywlVOgQyyalMaQNfR6mAFkDKUMzu8axUQ3ws+H2K38Qchq/ibgrWgM7YDoha4Gso8f0AjXvrqUvL55/fFwyQPLOQVqYnHpD4psMnKhx4y3h0AbdDf5rLGF7u3ErzgHsmAabpOMfVWR2sjdyiNc+8p4fv7x+kaKbQX0S8krb2BLYDkTW/qxCmIVM3dwx0DDlZXMQhi41BLTOOBu2HS8tV5fQSC5P4djYAaTLSQr61AjW56+bv6D8VrSd7l7KLF92e8Zi9v+ocL1c5/Xyid4Zk0sgVhF+2AU2aXG8UumBjTMQUsd0EJFyhuh8jE8CiFaP3cosjoekqyv6QBmPfY4QED26S0olD3werCaDyW3319fX7rr9fV7SevD7lLg9XJ3zwP77uv79/oepk6Qi8qsfHLcItWOGwB6BlRAIL+RYh4ck9rTsliTBLl0whuDNj+/jISWyB8//CHhtXFOL+tr1/3brkS3vg5/hoC94JyC679UwCoFwYpUwjKmHcEw/TZY3oTeLHYxvRf9oO9kKmmpqqubxGINSCUQ99rxuAR9dZ2XwrnDLyMB2cebXhrhwGuL5P3zp+cdBCdwVX+Ndwmqg9cXJbErVY2BUmOwobyKC7TiwF3rvM5XtqFQSRPfkF5KqYMIafspglHSSNiS9toqxDaZm9GdA0HFRHAMfud7xDC81rReV+/1dxef7tXUVn/h/tNFD9iLr5aSWLWCXa3kWiiXGh0A3hpaarzMKmJlXeOCZGiTjFQguSTtp0IyvFNHFuNmjvr9oijDRFtMcXcStF7qiTT/+5NJIxzOW0dcD+Rdl9TKsb1seL3uA1uZWDWx6johgl7WjdGeh/3MKgyGsaVA6xUhpYDUXacqaJ2cXCK12KowkM/t1ZN4G/heWxFeCzKpmvrbq+d+Pd10RnbfZK8q9Drj2lwXDbnlP5+e7zvn9vKI6pFwntfKi6WYWLVbQC2kCeXQRrZBqqx3jbxQLHaUZtSKZ3+dRhj5+UKj3H+5QW0XNktOkJ7jjeexveaZMem14G4Ullby7dXIfmymKta8PnS4dqhef7u7+/rXl59f7hoa6//26fn5/v5oX+/vn1vCL/rAXtz9JJhYMCozrFzxLMhSSAjjJdsNvXguWNvFSsK+F5n5RqfVgb+RbAHPsWPdWn/5floVuYF8w7g/11Dlifn/b+9sdtyEoTBawCYwGGJ+k0WliK5YZDSaRTUSVVhUdNr3f6Iah3QIJWBIO8Xqd5YRMsac3NwY+5LRcOBPFPEj9U5vDHZzFUwQUu4t2674/enrp9ZXaat0taiqY9moWpO85SLtx3NqK8xt+fXBgLAqOcFeZUurUDZwetyIzK7BorSz8ooIyejWnFe75MHLGLXsqwVcxAlCi8bG/NIcXtasXOx2KbCjhJuLdqD2G5P9ipJsfAxdr8/0GJicCuPI1XnYzMt3jbg3jsRx/DRimXfrjkwr+0Pmr4+NroVUVZq6P4v6dlwjbTGo5iBCe1VhlQoHiF/YPrcHfeMZnCWy9uMuiTPTXVRr58E1DR4n53aahrixsKXrLtGEbUVLH5bTNBYnu7d+eXcUch4fdTO7nChhS88jxnF7ufbm4jNztJ3J1y6/fvvy+fH52AbV1tRB1fK8LquDirXioKqsFYWd2ocJ/jMmo+zrS+fnf+LYPCcy1I5qe5ABlqgKO1GbAkDZ3+NsrmxXa21ZFYdBb89pcDnD12YJzwZ3CcxSdjZC8fpUHqvuNJg0uGgS4XrWN2CyZhWAsn/K2nxfv5xOZVkej00iXJ5EKqySXMxYcgeg7F8Qt8uCJiwYC95R2fuxDNwhoJOyIYwFWimbZrg/QCdlkRUAvZSFsUAvZTG7BbRSdqJONoCyK2NkZyqAsivEj7GuAOikbJph7RbQSFmH4o8X0ElZnyGNBTopayEpADop6yeY2wIaKUssjhALNFLWZgixQCNlA8ViLQDKrmNmy+J4egD0UdaxYsxsAX2UvXrbIwBrVzaAsEAnZf3B920DsE5lnfH6kQCsSlmyoHoqAP9KWSeksYkACzRR1k93HL4CPZQNbFnmGo+5wOqVdQI/tCjjJnQFq1aW+LadWhGVxc9dJAPgfn4Cp8sJ3qZEE9QAAAAASUVORK5CYII=';
    with open(os.getcwd() + "/images/site-logo.png", "wb") as site_logo_fh:
        site_logo_fh.write(base64.decodebytes(default_discourse_logo))
else:
    # site_logo_image_url = base_url + site_logo.attrs['src']
    ## Looks like maybe the API changed?
    site_logo_image_url = site_logo.attrs['src']
    parsed = urlparse(site_logo_image_url)
    if parsed.netloc == '':
        site_logo_image_url = base_url + site_logo_image_url
    response = requests.get(site_logo_image_url, stream=True)
    img = Image.open(BytesIO(response.content))
    img.save(os.getcwd() + '/images/site-logo.png')

encoded_missing_image_png = b''
with open(os.getcwd() + "/images/missing_image.png", "wb") as missing_image_fh:
    missing_image_fh.write(base64.decodebytes(encoded_missing_image_png))
In [6]:
# This is where *most* of the action happens.

# The following bit of code grabs discourse_url/latest.json to generate a list of topics.
# For each of these topics, we apply topic_row to generate a line on the main page.
# If 'more_topics_url' appears in the response, we get more.

# Note that there might be errors but the code does attempt to deal with them gracefully by
# passing over them and continuing.
# 
# My archive of DiscoureMeta generated 19 errors - all image downloads that replaced with a missing image PNG.

max_more_topics = 5;
cnt = 0
topic_path = '/latest.json?no_definitions=true&page='
base_topic_url = base_url + topic_path
url = base_topic_url + str(cnt)
topic_list_string = ""
response = requests.get(url)
topic_list = response.json()['topic_list']['topics']
for topic in topic_list:
    try: 
        write_topic(topic)
        topic_list_string = topic_list_string + topic_row(topic)
    except Exception as err:
        #template = "An exception of type {0} occured. Arguments:\n{1!r}"
        #message = template.format(type(err).__name__, err.args)
        #print('in loop error:', message, cnt, topic['slug'], "\n===========\n")
        #sys.exit(0)
        pass
    sleep(1)  # Seems the polite thing to do
while 'more_topics_url' in response.json()['topic_list'].keys() and cnt < max_more_topics:
    print('cnt is ', cnt, "\n============")
    cnt = cnt+1
    url = base_topic_url + str(cnt)
    response = requests.get(url)
    topic_list = response.json()['topic_list']['topics']
    for topic in topic_list[1:]:  ## STARTED AT 1 'CAUSE IT APPEARS THAT 
                                  ## LAST THIS = FIRST NEXT   GOTTA CHECK THAT!
        topic_list_string = topic_list_string + topic_row(topic)
        write_topic(topic)
post_row save image 9372109 https://avatars0.githubusercontent.com/u/9372109?v=4 An exception of type KeyError occured. Arguments:
('',)
cnt is  0 
============
post_row save image 8a60a3c8092e2d47c70b2648aa4e6f7a1ac333f0.png https://meta-s3-cdn.global.ssl.fastly.net/original/3X/8/a/8a60a3c8092e2d47c70b2648aa4e6f7a1ac333f0.png An exception of type OSError occured. Arguments:
('cannot identify image file <_io.BytesIO object at 0x10b0a72b0>',)
post_row save image 755354 https://avatars0.githubusercontent.com/u/755354?v=3 An exception of type KeyError occured. Arguments:
('',)
post_row save image social-media-1.png http://www.progcode.co/img/social-media-1.png An exception of type OSError occured. Arguments:
('cannot identify image file <_io.BytesIO object at 0x10b98ceb8>',)
post_row save image 17538 https://avatars0.githubusercontent.com/u/17538?v=4 An exception of type KeyError occured. Arguments:
('',)
post_row save image 17538 https://avatars0.githubusercontent.com/u/17538?v=4 An exception of type KeyError occured. Arguments:
('',)
cnt is  1 
============
post_row save image apple-touch-icon-114x114-precomposed.png https://meta.discourse.org/img/apple-touch-icon-114x114-precomposed.png An exception of type OSError occured. Arguments:
('cannot identify image file <_io.BytesIO object at 0x10818ca98>',)
cnt is  2 
============
post_row save image 368961 https://avatars0.githubusercontent.com/u/368961?v=3&s=400 An exception of type KeyError occured. Arguments:
('',)
post_row save image 368961 https://avatars0.githubusercontent.com/u/368961?v=3&s=400 An exception of type KeyError occured. Arguments:
('',)
post_row save image 2060315 https://avatars.githubusercontent.com/u/2060315?v=3 An exception of type KeyError occured. Arguments:
('',)
post_row save image 2060315 https://avatars.githubusercontent.com/u/2060315?v=3 An exception of type KeyError occured. Arguments:
('',)
cnt is  3 
============
post_row save image 4335742 https://avatars7.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:
('',)
post_row save image 1385470 https://avatars3.githubusercontent.com/u/1385470?v=4 An exception of type KeyError occured. Arguments:
('',)
post_row save image 4335742 https://avatars7.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:
('',)
post_row save image 4335742 https://avatars0.githubusercontent.com/u/4335742?v=3 An exception of type KeyError occured. Arguments:
('',)
post_row save image 4335742 https://avatars3.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:
('',)
post_row save image 4335742 https://avatars3.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:
('',)
cnt is  4 
============
post_row save image 16833227 https://avatars3.githubusercontent.com/u/16833227?v=3&s=400 An exception of type KeyError occured. Arguments:
('',)
post_row save image 4335742 https://avatars.githubusercontent.com/u/4335742?v=3 An exception of type KeyError occured. Arguments:
('',)
In [7]:
# Wrap things up.
# Make the replacements and print the main file.
file_string = main_template \
    .replace("<!-- TITLE -->", str(site_title)) \
    .replace("<!-- JUST_SITE_TITLE -->", str(site_title.text)) \
    .replace("<!-- ARCHIVE_BLURB -->", archive_blurb) \
    .replace("<!-- TOPIC_LIST -->", topic_list_string)

f = open('index.html', 'w')
f.write(file_string)
f.close()


## Write out the CSS.

css = """@media only screen and (max-width: 760px) {
    body {
        margin: 0;
    }
    .header {
        position: fixed;
        top: 0;
        height: 80px;
        width: 100%;
        z-index: 100;
        background-color: #fff;
        box-shadow: 0 2px 4px -1px rgba(0,0,0,0.25);
    }
    .title-span {
        width: 100%;
        margin-top: 5px;
        margin-left: auto;
        margin-right: auto;
        height: 70px;
        margin-bottom: 5px;
    }
    h1.site-title {
      font-size: 200%;
      margin: 0px;
    }
    .archive-span {
        margin: 5px;
        color: gray;
    }
    .main{
        width: 90%;
        margin-top: 150px;
        margin-left: auto;
        margin-right: auto;
        margin-bottom: auto;
    }
    .topic-head{
        display: inline-block;
        width:90%;
    }
    .category-head{
        display: none;
    }
    .post-count-head{
        display: none;
    }
    .topic{
        display: inline-block;
        width:90%;
    }
    .category{
        display: none;
    }
    .post-count{
        display: none;
    }
}

@media only screen and (min-width: 761px) {
  .header {
      position: fixed;
      top: 0;
      height: 80px;
      width: 100%;
      z-index: 100;
      background-color: #fff;
      box-shadow: 0 2px 4px -1px rgba(0,0,0,0.25);
  }
  .title-span {
      width: 80%;
      margin-top: 5px;
      margin-left: auto;
      margin-right: auto;
      height: 70px;
      margin-bottom: 5px;
  }
  h1.site-title {
    font-size: 300%;
    margin: 0px;
  }
  .archive-span {
      color: gray;
      font-size: 120%;
      padding: 10px;
  }
  .main{
      width: 70%;
      margin-top: 100px;
      margin-left: auto;
      margin-right: auto;
      margin-bottom: auto;
  }
  .topic-head{
      display: inline-block;
      width:70%;
  }
  .category-head{
      display: inline-block;
      width:15%;
  }
  .post-count-head{
      display: inline-block;
      width:14%;
  }
  .topic{
      display: inline-block;
      width:70%;
  }
  .category{
      display: inline-block;
      width:15%;
  }
  .post-count{
      display: inline-block;
      width:14%;
  }
}

a {
    color: black;
}
a:visited {
    color: gray;
}

h1.topic-title {
    border-bottom: 2px solid darkgray;
}

.user_name {
    font-size: 110%;
    color: #555555;
}
.post_container {
    border-bottom: 1px solid lightgray;
    padding: 20px;
}

.avatar {
    border-radius: 50%;
}
.avatar_container {
    float: left;
}

label {
    display: inline-block;
    width: 5em;
}

.fa {
    padding-right: 5px;
}
.header-row {
    padding-bottom: 8px;
    border-bottom: 3px solid gray;
}
.topic-row {
    padding: 8px;
    border-bottom: 1px solid lightgray;
}

div.meta {
    display: none;
}
"""

f = open('archived.css', 'w')
f.write(css)
f.close()