En crawler til mappe-visninger på nettet

Hvis du har været på internettet, er du sikkert en gang stødt på sådan ét her:

Mange webadministratorer vælger at skjule disse oversigter over filer på en webserver, som webserversoftwaren Apache kan generere automatisk.

Men jeg opdagede ved et tilfælde, at jeg kunne se, hvad fotoagenturet Magnum havde lagt op i deres WordPress-installation.

Jeg besluttede at forsøge at lave en lokal kopi, så jeg kunne kigge på flotte fotografier uden at skulle vente på downloads fra internettet.

Først forsøgte jeg med Wget, som er et lille program, der er designet til at dublere websteder lokalt. Men Wget havde problemer med at hente og tygge sig igennem de lange lister med filer. En af dem fyldte fx 36 megabytes. Det er altså rigtig mange links.

Derfor lavede jeg et lille Python-program, der kan tygge sig igennem denne type mappe- og filoversigter og downloade dem lokalt.

Her er det:

# apache-directory-downloader.py
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
'''A program to fetch files from standard apache directory listings on the internet.
See https://duckduckgo.com/?t=ffab&q=apache%2Bdirectory%2Blisting&ia=images&iax=images
for examples of what this is.'''

import requests					# Send http requests and receive responses
from bs4 import BeautifulSoup	# Parse HTML data structures, e.g. to search for links
import os						# Used to create directories at local destination
import shutil					# Used to copy binary files from http response to local destination
import re						# Regex parser and search functions

# Terms to exclude, files with these strings in them are not downloaded
exclude = [
	"-medium",
	"-overlay",
	"-teaser-",
	"-overlay",
	"-thumbnail",
	"-collaboration",
	"-scaled",
	"-photographer-featured",
	"-photographer-listing",
	"-full-on-mobile",
	"-theme-small-teaser",
	"-post",
	"-large",
	"-breaker",
	]

# Takes an url and collects all links
def request(url, save_location):
	# Print status to let user know that something is going on
	print("Requesting:", url)
	# Fetch url
	response = requests.get(url)
	# Parse response
	soup = BeautifulSoup(response.text, "lxml")
	# Search for all links and exclude certain strings and patterns from links
	urllist = [a['href'] for a in soup.find_all('a', href=True) if not '?C=' in a['href'] and not a['href'][0] == "/" and not any(term in a['href'] for term in exclude) and not re.search("\d\d[x]\d\d",a['href'])]
	# If status code is not 200 (OK), add url to list of errors
	if not response.status_code == 200:
		errorlist.append(url)
	# Send current url, list of links and current local save collection to scrape function
	return scrape(url, urllist, save_location)

def scrape(path, content, save_location):
	# Loop through all links
	for url in content:
		# Print status to let user know that something is going on
		print("Parsing/downloading:", path+url)
		# If there's a slash ("/") in the link, it is a directory
		if "/" in url:
			# Create local directory if it doesn't exists
			try:
				os.mkdir(save_location+url)
			except:
				pass
			# Run request function to fetch contents of directory
			request(path+url, save_location+url)
		# If the link doesn't contain a slash, it's a file and is saved
		else:
			# Check if file already exists, e.g. has been downloaded in a prior run
			if not os.path.isfile(save_location+url):
				# If file doesn't exist, fetch it from remote location
				file = requests.get(path+url, stream=True)
				# Print status to let user know that something is going on
				print("Saving file:", save_location+url)
				# Save file to local destination
				with open(save_location+url, 'wb') as f:
					# Decodes file if received compressed from server
					file.raw.decode_content = True
					# Copies binary file to local destination
					shutil.copyfileobj(file.raw, f)

# List to collect crawling errors
errorlist = []
# Local destination, e.g. 'C:\Downloads' for Windows
save_location = "C:/Downloads/"
# Remote location, e.g. https://example.com/files
url = "https://content.magnumphotos.com/wp-content/uploads/"
# Call function to start crawling
request(url, save_location)
# Print any crawling errors
print(errorlist)