#!/usr/bin/env python2
"""Download xkcd comic images into the current working directory.

For every comic number from FIRST_COMIC to LAST_COMIC (inclusive) the comic
page is fetched, the first ``<img>`` inside ``<div id="comic">`` is located
and the image is saved as ``<zero-padded number>_<image file name>``.

The code is written to run unchanged under Python 2 and Python 3.

Provenance: recovered from git patch 45b211785f8dfd4ab73aaba606ecb1bd09f195f2
(Alexander Weinhold, Wed, 10 Feb 2016 22:34:38 +0100, "well, git is good at
preserving the symlink, that's for sure..."), which replaced the symlink
``download_xkcd.py -> ../xkcd/download_xkcd.py`` with a regular file.
"""

from contextlib import closing

import requests
from bs4 import BeautifulSoup

try:  # Python 3
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

FIRST_COMIC = 1
LAST_COMIC = 1633

# Seconds to wait on the server before giving up; without a timeout a single
# stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30


def download(img_url, i):
    """Stream the image at *img_url* into a file in the current directory.

    The file name is ``"%04d_%s" % (i, last path segment of img_url)``.

    Args:
        img_url: Absolute URL of the image to fetch.
        i: Comic number, used as the zero-padded file name prefix.

    Returns:
        None. A response with an HTTP error status is reported and skipped,
        so an error page is never stored under an image file name.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    file_name = "%04d_%s" % (i, img_url.split("/")[-1])
    # closing() releases the streamed connection even if writing fails.
    with closing(requests.get(img_url, stream=True,
                              timeout=REQUEST_TIMEOUT)) as img:
        if not img.ok:
            print("Skipped %s (HTTP %d)" % (img_url, img.status_code))
            return
        with open(file_name, "wb") as f:
            for chunk in img.iter_content(chunk_size=1024):
                if chunk:  # ignore empty chunks
                    f.write(chunk)
    print("Got %s" % file_name)


def run():
    """Download the image of every comic from FIRST_COMIC to LAST_COMIC.

    Comics whose page cannot be fetched, or whose page has no
    ``<div id="comic">`` containing an ``<img>`` with a ``src`` attribute,
    are skipped.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    for i in range(FIRST_COMIC, LAST_COMIC + 1):
        page_url = "http://xkcd.com/%d/" % i
        p = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        if not p.ok:
            print("Skipped comic %d (HTTP %d)" % (i, p.status_code))
            continue

        soup = BeautifulSoup(p.text, "html.parser")
        div = soup.find("div", id="comic")
        if div is None:
            continue
        img = div.find("img")
        if img is None:
            continue
        src = img.get("src")
        if not src:
            continue

        # urljoin resolves protocol-relative ("//host/path"), absolute and
        # site-relative sources alike; for "//host/path" the result is
        # "http://host/path".
        download(urljoin(page_url, src), i)


if __name__ == "__main__":
    run()