{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Archiving Discourse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This Jupyter notebook contains the Python code I use to auto-archive my Discourse instances using the API. You can read more about this in [my question](https://meta.discourse.org/t/a-basic-discourse-archival-tool/62614) on [DiscourseMeta](https://meta.discourse.org/).\n", "\n", "If you're reading the HTML version off of my webpage, the actual notebook file should be [here](ArchiveDiscourse.ipynb). There's also a [version on GitHub](https://github.com/mcmcclur/ArchiveDiscourse).\n", "\n", "As an example, let's try to archive a bit of https://meta.discourse.org. Note that this site is really too big to archive the whole thing - at least for me. You can adjust how much you want to download by fiddling with the `max_more_topics` parameter in code block 6 below. For the purposes of this demo, the parameter is set to 5. You can examine the archived version [here](https://www.marksmath.org/ArchiveDiscourse/meta_discourse/)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Archive meta.discourse.org\n", "\n", "# Be sure to define the base_url of the Discourse instance, \n", "# the path of the directory to save stuff on the local machine, \n", "# and a blurb to describe the site.\n", "\n", "# Note that the directory specified by `path` will be overwritten.\n", "\n", "import os\n", "from datetime import date\n", "base_url = 'https://meta.discourse.org'\n", "path = os.path.join(os.getcwd(), 'meta_discourse')\n", "archive_blurb = \"A partial archive of meta.discourse.org as of \" + \\\n", " date.today().strftime(\"%A %B %d, %Y\") + '.'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From here, the code should just work, *assuming* the following libraries are all installed. 
You might need to install `requests`, `PIL`, and `BeautifulSoup`; they don't come with every Python distribution." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import requests, base64, sys\n", "from urllib.parse import urlparse\n", "from bs4 import BeautifulSoup as bs\n", "from PIL import Image\n", "from io import BytesIO\n", "import requests, base64\n", "from urllib.parse import urlparse\n", "from time import sleep\n", "\n", "from shutil import rmtree" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# When archiving larger sites (like meta.discourse.org), you might need to \n", "# increase the number of retries to connect.\n", "# Doesn't seem to be necessary for my site but it *is* necessary for Meta.\n", "\n", "from requests.adapters import HTTPAdapter\n", "\n", "s = requests.Session()\n", "s.mount(base_url, HTTPAdapter(max_retries=5))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Templates for the webpages\n", "base_scheme = urlparse(base_url).scheme\n", "\n", "# Template for the main page. Subsequent code will replace a few items indicated by\n", "# \n", "main_template = \"\"\"\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
\n", "

\n", " \"<!--\" id=\"site-logo\">\n", "

\n", "
\n", "
\n", "\n", "
\n", "
\n", "
\n", "
\n", " Topics\n", " Category\n", " Posts\n", "
\n", " \n", "
\n", "
\n", " \n", "\n", "\"\"\"\n", "\n", "\n", "# Template for the individual topic pages\n", "topic_template = \"\"\"\n", "\n", " \n", " \n", " \n", " <!-- TOPIC_TITLE -->\n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "
\n", " \n", " \"<!--\" id=\"site-logo\" />\n", " \n", "
\n", "
\n", "\n", "
\n", "
\n", "

\n", " \n", "
\n", " \n", "\n", "\"\"\"\n", "\n", "\n", "\n", "# Function that writes out each individual topic page\n", "def write_topic(topic_json):\n", " topic_download_url = base_url + '/t/' + topic_json['slug']+'/'+str(topic_json['id'])\n", " topic_relative_url = 't/' + topic_json['slug']+'/'+str(topic_json['id'])\n", " try:\n", " os.makedirs(topic_relative_url)\n", " except Exception as err:\n", " print('in write_topic error:', 'make directory')\n", " response = requests.get(topic_download_url + '.json')\n", " posts_json = response.json()['post_stream']['posts']\n", " post_list_string = \"\"\n", " for post_json in posts_json:\n", " post_list_string = post_list_string + post_row(post_json)\n", " topic_file_string = topic_template \\\n", " .replace(\"\", topic_json['fancy_title']) \\\n", " .replace(\"\", str(site_title.text)) \\\n", " .replace(\"\", archive_blurb) \\\n", " .replace(\"\", post_list_string)\n", "\n", " f = open(topic_relative_url + '/index.html', 'w')\n", " f.write(topic_file_string)\n", " f.close()\n", "\n", "# Function that creates the text describing the individual posts in a topic\n", "def post_row(post_json):\n", " avatar_url = post_json['avatar_template']\n", " parsed_url = urlparse(avatar_url)\n", " path = parsed_url.path\n", " avatar_file_name = path.split('/')[-1]\n", " if(parsed_url.netloc and parsed_url.scheme):\n", " pass\n", " elif(parsed_url.netloc):\n", " avatar_url = base_scheme + ':' + avatar_url\n", " else:\n", " avatar_url = base_url + avatar_url\n", "# if(not parsed_url.scheme):\n", "# if avatar_url[0] == '/':\n", "# avatar_url = base_url + avatar_url\n", "# else:\n", "# avatar_url = base_scheme + '://' + avatar_url\n", " avatar_url = avatar_url.replace('{size}', '45')\n", " if not os.path.exists(os.getcwd() + '/images/' + avatar_file_name):\n", " try:\n", " response = requests.get(avatar_url, stream=True)\n", " img = Image.open(BytesIO(response.content))\n", " img.save(os.getcwd() + '/images/' + avatar_file_name)\n", " except Exception as 
err:\n", " template = \"An exception of type {0} occured. Arguments:\\n{1!r}\"\n", " message = template.format(type(err).__name__, err.args)\n", " print('in post_row error:', 'write avatar', avatar_url, message, cnt, topic['slug'], \"\\n===========\\n\")\n", " #sys.exit(0)\n", "\n", " user_name = post_json['username']\n", " content = post_json['cooked']\n", " \n", " # Since we don't generate user information, \n", " # replace any anchors of class mention with a span\n", " soup = bs(content, \"html.parser\")\n", " mention_tags = soup.findAll('a', {'class':'mention'})\n", " for tag in mention_tags:\n", " try:\n", " rep = bs('', \"html.parser\").find('span')\n", " rep.string = tag.string\n", " tag.replaceWith(rep)\n", " except TypeError:\n", " pass\n", "\n", " img_tags = soup.findAll('img')\n", " for img_tag in img_tags:\n", " img_url = img_tag['src']\n", " parsed_url = urlparse(img_url)\n", " path = parsed_url.path\n", " file_name = path.split('/')[-1]\n", " if(parsed_url.netloc and parsed_url.scheme):\n", " pass\n", " elif(parsed_url.netloc):\n", " img_url = base_scheme + ':' + img_url\n", " else:\n", " img_url = base_url + img_url\n", " #response = requests.get('http:' + img_url, stream=True)\n", " try:\n", " response = requests.get(img_url, stream=True)\n", " img = Image.open(BytesIO(response.content))\n", " img.save(os.getcwd() + '/images/' + file_name)\n", " img_tag['src'] = '../../../images/' + file_name\n", " # print('good', file_name, img_url)\n", " except Exception as err:\n", " template = \"An exception of type {0} occured. Arguments:\\n{1!r}\"\n", " message = template.format(type(err).__name__, err.args)\n", " print('post_row', 'save image', file_name, img_url, message)\n", " img_tag['src'] = '../../../images/missing_image.png'\n", " #sys.exit(0)\n", "\n", " content = ''\n", " for s in soup.contents:\n", " content = content + str(s)\n", " \n", " post_string = '
\\n'\n", " post_string = post_string + '
\\n'\n", " post_string = post_string + ' \\n'\n", " post_string = post_string + '
\\n'\n", " post_string = post_string + '
\\n'\n", " post_string = post_string + '
' + user_name + '
\\n'\n", " post_string = post_string + '
\\n'\n", " post_string = post_string + content + '\\n'\n", " post_string = post_string + '
\\n'\n", " post_string = post_string + '
\\n'\n", " post_string = post_string + '
\\n\\n'\n", " return post_string\n", "\n", "\n", "# The topic_row function generates the HTML for each topic on the main page\n", "category_url = base_url + '/categories.json'\n", "response = requests.get(category_url)\n", "category_json = response.json()['category_list']['categories']\n", "category_id_to_name = dict([(cat['id'],cat['name']) for cat in category_json])\n", "\n", "def topic_row(topic_json):\n", " topic_html = '
\\n'\n", " topic_url = 't/' + topic_json['slug']+'/'+str(topic_json['id'])\n", " topic_title_text = topic_json['fancy_title']\n", " topic_post_count = topic_json['posts_count']\n", " topic_pinned = topic_json['pinned_globally']\n", " try:\n", " topic_category = category_id_to_name[topic_json['category_id']]\n", " except KeyError:\n", " topic_category = ''\n", " \n", " topic_html = topic_html + ' '\n", " if topic_pinned:\n", " topic_html = topic_html + ''\n", " topic_html = topic_html + ''\n", " topic_html = topic_html + topic_title_text + '\\n'\n", " topic_html = topic_html + ' '\n", " topic_html = topic_html + topic_category + '\\n'\n", " topic_html = topic_html + ' '\n", " topic_html = topic_html + str(topic_post_count) + '\\n'\n", " topic_html = topic_html + '
\\n\\n'\n", " return topic_html" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The action is just starting here.\n", "\n", "# Check for the directory where plan to store things.\n", "# Note that this will be overwritten!\n", "if os.path.exists(path) and os.path.isdir(path):\n", " rmtree(path)\n", "os.mkdir(path)\n", "os.chdir(path)\n", "os.mkdir('images')\n", "\n", "# Grab the site title and logo - available via the API but only after login\n", "# so we'll grab this one thing via Beautiful Soup.\n", "response = requests.get(base_url)\n", "soup = bs(response.content, \"html.parser\")\n", "site_title = soup.title\n", "site_logo = soup.find(\"img\", {\"id\":\"site-logo\"})\n", "if site_logo == None:\n", " default_discourse_logo = b'';\n", " with open(os.getcwd() + \"/images/site-logo.png\", \"wb\") as site_logo_fh:\n", " site_logo_fh.write(base64.decodebytes(default_discourse_logo))\n", "else:\n", " # site_logo_image_url = base_url + site_logo.attrs['src']\n", " ## Looks like maybe the API changed?\n", " site_logo_image_url = site_logo.attrs['src']\n", " parsed = urlparse(site_logo_image_url)\n", " if parsed.netloc == '':\n", " site_logo_image_url = base_url + site_logo_image_url\n", " response = requests.get(site_logo_image_url, stream=True)\n", " img = Image.open(BytesIO(response.content))\n", " img.save(os.getcwd() + '/images/site-logo.png')\n", "\n", "encoded_missing_image_png = b''\n", "with open(os.getcwd() + \"/images/missing_image.png\", \"wb\") as missing_image_fh:\n", " missing_image_fh.write(base64.decodebytes(encoded_missing_image_png))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "post_row save image 9372109 https://avatars0.githubusercontent.com/u/9372109?v=4 An exception of type KeyError occured. 
Arguments:\n", "('',)\n", "cnt is 0 \n", "============\n", "post_row save image 8a60a3c8092e2d47c70b2648aa4e6f7a1ac333f0.png https://meta-s3-cdn.global.ssl.fastly.net/original/3X/8/a/8a60a3c8092e2d47c70b2648aa4e6f7a1ac333f0.png An exception of type OSError occured. Arguments:\n", "('cannot identify image file <_io.BytesIO object at 0x10b0a72b0>',)\n", "post_row save image 755354 https://avatars0.githubusercontent.com/u/755354?v=3 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image social-media-1.png http://www.progcode.co/img/social-media-1.png An exception of type OSError occured. Arguments:\n", "('cannot identify image file <_io.BytesIO object at 0x10b98ceb8>',)\n", "post_row save image 17538 https://avatars0.githubusercontent.com/u/17538?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 17538 https://avatars0.githubusercontent.com/u/17538?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "cnt is 1 \n", "============\n", "post_row save image apple-touch-icon-114x114-precomposed.png https://meta.discourse.org/img/apple-touch-icon-114x114-precomposed.png An exception of type OSError occured. Arguments:\n", "('cannot identify image file <_io.BytesIO object at 0x10818ca98>',)\n", "cnt is 2 \n", "============\n", "post_row save image 368961 https://avatars0.githubusercontent.com/u/368961?v=3&s=400 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 368961 https://avatars0.githubusercontent.com/u/368961?v=3&s=400 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 2060315 https://avatars.githubusercontent.com/u/2060315?v=3 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 2060315 https://avatars.githubusercontent.com/u/2060315?v=3 An exception of type KeyError occured. 
Arguments:\n", "('',)\n", "cnt is 3 \n", "============\n", "post_row save image 4335742 https://avatars7.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 1385470 https://avatars3.githubusercontent.com/u/1385470?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 4335742 https://avatars7.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 4335742 https://avatars0.githubusercontent.com/u/4335742?v=3 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 4335742 https://avatars3.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 4335742 https://avatars3.githubusercontent.com/u/4335742?v=4 An exception of type KeyError occured. Arguments:\n", "('',)\n", "cnt is 4 \n", "============\n", "post_row save image 16833227 https://avatars3.githubusercontent.com/u/16833227?v=3&s=400 An exception of type KeyError occured. Arguments:\n", "('',)\n", "post_row save image 4335742 https://avatars.githubusercontent.com/u/4335742?v=3 An exception of type KeyError occured. 
Arguments:\n", "('',)\n" ] } ], "source": [
"# This is where *most* of the action happens.\n",
"\n",
"# Walk base_url/latest.json one page at a time. Every topic on a page is written\n",
"# out by write_topic and, if that worked, gets a line on the main page from topic_row.\n",
"# While 'more_topics_url' appears in the response we fetch the next page, up to\n",
"# max_more_topics extra pages.\n",
"\n",
"# A topic that raises is reported and passed over, so a single bad topic does not\n",
"# stop the archive.\n",
"#\n",
"# My archive of DiscourseMeta generated 19 errors - all image downloads that were\n",
"# replaced with a missing image PNG.\n",
"\n",
"max_more_topics = 5\n",
"topic_path = '/latest.json?no_definitions=true&page='\n",
"base_topic_url = base_url + topic_path\n",
"topic_list_string = \"\"\n",
"\n",
"# The first topic of a page has been seen to repeat the last topic of the page\n",
"# before it, so remember the ids already archived instead of guessing which\n",
"# entry is the duplicate.\n",
"seen_topic_ids = set()\n",
"\n",
"# `cnt` and `topic` stay notebook-level names on purpose: post_row prints both\n",
"# when an avatar download fails.\n",
"cnt = 0\n",
"while True:\n",
"    # Use the session `s` so the retry adapter mounted for base_url takes effect.\n",
"    response = s.get(base_topic_url + str(cnt))\n",
"    response.raise_for_status()\n",
"    topic_list_json = response.json()['topic_list']\n",
"    for topic in topic_list_json['topics']:\n",
"        if topic['id'] in seen_topic_ids:\n",
"            continue\n",
"        seen_topic_ids.add(topic['id'])\n",
"        try:\n",
"            # Write the page first so the main page never links to a missing topic.\n",
"            write_topic(topic)\n",
"            topic_list_string = topic_list_string + topic_row(topic)\n",
"        except Exception as err:\n",
"            print('in loop error:', type(err).__name__, err.args, cnt, topic.get('slug'))\n",
"        sleep(1)  # Seems the polite thing to do\n",
"    if 'more_topics_url' not in topic_list_json or cnt >= max_more_topics:\n",
"        break\n",
"    print('cnt is ', cnt, \"\\n============\")\n",
"    cnt = cnt + 1"
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [
"# Wrap things up.\n",
"# Make the replacements and print the main file.\n",
"file_string = main_template \\\n",
"    .replace(\"\", str(site_title)) \\\n",
"    .replace(\"\", str(site_title.text)) \\\n",
"    .replace(\"\", archive_blurb) \\\n",
"    .replace(\"\", topic_list_string)\n",
"\n",
"# The encoding is given explicitly so titles with non-ASCII characters are written\n",
"# the same way whatever the platform default is; `with` closes the file even if\n",
"# the write fails.\n",
"with open('index.html', 'w', encoding='utf-8') as index_file:\n",
"    index_file.write(file_string)\n",
"\n",
"\n",
"## Write out the CSS.\n",
"\n",
"css = \"\"\"@media only screen and (max-width: 760px) {\n",
" body {\n",
" margin: 0;\n",
" }\n",
" .header {\n",
" position: fixed;\n",
" top: 0;\n",
" height: 80px;\n",
" width: 100%;\n",
" z-index: 100;\n",
" background-color: #fff;\n",
" box-shadow: 0 2px 4px -1px rgba(0,0,0,0.25);\n",
" }\n",
" .title-span {\n",
" width: 100%;\n",
" margin-top: 5px;\n",
" margin-left: auto;\n",
" margin-right: auto;\n",
" height: 70px;\n",
" margin-bottom: 5px;\n",
" }\n",
" h1.site-title {\n",
" font-size: 200%;\n",
" margin: 0px;\n",
" }\n",
" .archive-span {\n",
" margin: 5px;\n",
" color: gray;\n",
" }\n",
" .main{\n",
" width: 90%;\n",
" margin-top: 150px;\n",
" margin-left: auto;\n",
" margin-right: auto;\n",
" margin-bottom: auto;\n",
" }\n",
" .topic-head{\n",
" display: inline-block;\n",
" width:90%;\n",
" }\n",
" .category-head{\n",
" display: none;\n",
" }\n",
" .post-count-head{\n",
" display: none;\n",
" }\n",
" .topic{\n",
" display: inline-block;\n",
" width:90%;\n",
" }\n",
" .category{\n",
" display: none;\n",
" }\n",
" .post-count{\n",
" display: none;\n",
" }\n",
"}\n",
"\n",
"@media only screen and (min-width: 761px) {\n",
" .header {\n",
" position: fixed;\n",
" top: 0;\n",
" height: 80px;\n",
" width: 100%;\n",
" z-index: 100;\n",
" background-color: #fff;\n",
" box-shadow: 0 2px 4px -1px rgba(0,0,0,0.25);\n",
" }\n",
" .title-span {\n",
" width: 80%;\n",
" margin-top: 5px;\n",
" margin-left: auto;\n",
" margin-right: auto;\n",
" height: 70px;\n",
" margin-bottom: 5px;\n",
" }\n",
" h1.site-title {\n",
" font-size: 300%;\n",
" margin: 0px;\n",
" }\n",
" .archive-span {\n",
" color: gray;\n",
" font-size: 120%;\n",
" padding: 10px;\n",
" }\n",
" .main{\n",
" width: 70%;\n",
" margin-top: 100px;\n",
" margin-left: auto;\n",
" margin-right: auto;\n",
" margin-bottom: auto;\n",
" }\n",
" .topic-head{\n",
" display: inline-block;\n",
" width:70%;\n",
" }\n",
" .category-head{\n",
" display: inline-block;\n",
" width:15%;\n",
" }\n",
" .post-count-head{\n",
" display: inline-block;\n",
" width:14%;\n",
" }\n",
" .topic{\n",
" display: inline-block;\n",
" width:70%;\n",
" }\n",
" .category{\n",
" display: inline-block;\n",
" width:15%;\n",
" }\n",
" .post-count{\n",
" display: inline-block;\n",
" width:14%;\n",
" }\n",
"}\n",
"\n",
"a {\n",
" color: black;\n",
"}\n",
"a:visited {\n",
" color: gray;\n",
"}\n",
"\n",
"h1.topic-title {\n",
" border-bottom: 2px solid darkgray;\n",
"}\n",
"\n",
".user_name {\n",
" font-size: 110%;\n",
" color: #555555;\n",
"}\n",
".post_container {\n",
" border-bottom: 1px solid lightgray;\n",
" padding: 20px;\n",
"}\n",
"\n",
".avatar {\n",
" border-radius: 50%;\n",
"}\n",
".avatar_container {\n",
" float: left;\n",
"}\n",
"\n",
"label {\n",
" display: inline-block;\n",
" width: 5em;\n",
"}\n",
"\n",
".fa {\n",
" padding-right: 5px;\n",
"}\n",
".header-row {\n",
" padding-bottom: 8px;\n",
" border-bottom: 3px solid gray;\n",
"}\n",
".topic-row {\n",
" padding: 8px;\n",
" border-bottom: 1px solid lightgray;\n",
"}\n",
"\n",
"div.meta {\n",
" display: none;\n",
"}\n",
"\"\"\"\n",
"\n",
"with open('archived.css', 'w', encoding='utf-8') as css_file:\n",
"    css_file.write(css)"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 1 }