Wallnots nye Twitter-robot

Opdatering: Jeg har lavet en ny, forbedret udgave af robotten. Læs om den her.

Både Zetland og Politiken har en feature, hvor abonnenter kan dele betalingsartikler med venner, bekendte og offentligheden. Artiklen får en unik URL, som låser op for betalingsmuren og lader alle og enhver læse artiklen.

Jeg tænkte at Wallnot – min hjemmeside med artikler, der ikke er bag betalingsmur – trængte til at være mere til stede på sociale medier.

Derfor har jeg lavet en robot, der gennemsøger Twitter for delte artikler fra Zetland og Politiken – og deler linkene som tweets. Robotten opdaterer ca. på klokkeslettene 8.25, 12.25, 16.25 og 20.25. Det ville være skønt at kunne opdatere flere gange i døgnet, men så vil Twitter have penge.

Du finder Wallnots nye Twitter-robot her: https://twitter.com/wallnot_dk

Sådan ser det ud, når Wallnots robot tweeter.

Jeg brugte Python og modulet TwitterAPI. Hvis du selv vil køre programmet, skal du lave en udviklerkonto og en app hos Twitter. Se
https://developer.twitter.com/.

Her er det færdige program.

# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
# THIS PROGRAM POSTS NEW SHARED ARTICLES FROM ZETLAND.DK AND POLITIKEN.DK TO TWITTER

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import time
from nested_lookup import nested_lookup
from TwitterAPI import TwitterAPI

# Accumulates articles from both the Politiken and Zetland sections below
# that have not been tweeted before; consumed by the posting loop at the end.
articlestopost = []

# API LOGIN - INSERT YOUR OWN VALUES HERE
# OAuth 1.0a user-context credentials from a Twitter developer app;
# user context is required both for premium search and for posting tweets.
client_key = ''
client_secret = ''
access_token = ''
access_secret = ''
api = TwitterAPI(client_key, client_secret, access_token, access_secret)


# POLITIKEN.DK SEARCH #
# Query the premium 30-day search endpoint for tweets that link to
# Politiken's share URLs (tokenized links that unlock the paywall).
SEARCH_TERM = 'url:"politiken.dk/del/"'
PRODUCT = '30day'
LABEL = 'prod'

r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), {'query': SEARCH_TERM})

tweet_data = json.loads(r.text)
prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging to pretify json

# Collect every expanded_url found anywhere in the response json
linklist = list(set(nested_lookup('expanded_url', tweet_data)))

# Keep only the links that point at Politiken share URLs
urllist = [link for link in linklist if "politiken.dk/del" in link]

# Request each shared Politiken article and collect title, canonical url
# and publication date; finally sort newest-first.
articlelist = []
titlecheck = []

for url in urllist:
	try:
		data = requests.get(url)
		result = data.text
		# Skip articles Politiken marks as freely accessible:
		# only paywalled shares are worth tweeting.
		if '"isAccessibleForFree": "True"' not in result:
			soup = BeautifulSoup(result, "lxml")
			# Title and publication timestamp from the og/article meta tags
			title = soup.find('meta', attrs={'property':'og:title'})['content']
			timestamp = soup.find('meta', attrs={'property':'article:published_time'})['content']
			dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z')
			# The share link redirects; the first hop's Location header holds
			# the canonical article url (with the shareToken still attached).
			realurl = data.history[0].headers['Location']
			# Deduplicate on title: the same article may be shared many times
			if title not in titlecheck:
				articlelist.append({"title": title, "url": realurl, "date": dateofarticle})
				titlecheck.append(title)
	# Was a bare "except:"; narrowed so Ctrl-C still works, and the reason
	# for the failure is logged instead of silently discarded.
	except Exception as e:
		print(url, e)

articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True)

# Check if article is already posted and update list of posted articles.
# Politiken articles are tracked by their token-stripped url.
with open("./pol_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))

for art in articlelist_sorted:
	# Strip the per-share "?shareToken=..." query so the same article shared
	# by different users is recognized as already published. str.partition is
	# safe when the token is missing (the old url.index raised ValueError
	# and crashed the whole run).
	url = art['url'].partition("?shareToken")[0]
	if url not in alreadypublished:
		alreadypublished.append(url)
		articlestopost.append(art)

# Save updated list of already published links (ensure_ascii=False to
# match how the Zetland list below is written)
with open("./pol_published.json", "wt", encoding="utf8") as fout:
	fout.write(json.dumps(alreadypublished, ensure_ascii=False))

# ZETLAND.DK SEARCH #
# Same premium 30-day search as above, this time for shared Zetland stories.
SEARCH_TERM = 'url:"zetland.dk/historie"'
PRODUCT = '30day'
LABEL = 'prod'

r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), {'query': SEARCH_TERM})

tweet_data = json.loads(r.text)
prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging to pretify json

# Collect every expanded_url in the response and keep Zetland story links
linklist = list(set(nested_lookup('expanded_url', tweet_data)))
urllist = [link for link in linklist if "zetland.dk/historie" in link]

# Request each shared Zetland article and collect title, url and
# publication date; finally sort newest-first.
articlelist = []
titlecheck = []

for url in urllist:
	try:
		data = requests.get(url)
		soup = BeautifulSoup(data.text, "lxml")

		# Title and publication timestamp from the og/article meta tags
		title = soup.find('meta', attrs={'property':'og:title'})['content']
		timestamp = soup.find('meta', attrs={'property':'article:published_time'})['content']
		# Zetland timestamps carry a "+HH:MM" utc offset; cut it off so the
		# offset-less strptime format below matches.
		timestamp = timestamp[:timestamp.find("+")]
		dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')

		# Deduplicate on title: the same story may be shared many times
		if title not in titlecheck:
			articlelist.append({"title": title, "url": url, "date": dateofarticle})
			titlecheck.append(title)
	# Was a bare "except:"; narrowed so Ctrl-C still works, and the reason
	# for the failure is logged instead of silently discarded. Also dropped
	# the unused "articledict" local from the original.
	except Exception as e:
		print(url, e)

articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True)

# Check if article is already posted and update list of posted articles.
# Zetland articles are tracked by title rather than url.
with open("./zet_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))

for art in articlelist_sorted:
	if art['title'] not in alreadypublished:
		alreadypublished.append(art['title'])
		articlestopost.append(art)

# Save updated list of already published titles
with open("./zet_published.json", "wt", encoding="utf8") as fout:
	fout.write(json.dumps(alreadypublished, ensure_ascii=False))


# POST TO TWITTER #
# Tweet every newly discovered shared article, pausing briefly between
# posts. (An empty articlestopost simply skips the loop.)
for art in articlestopost:
	medium = "Zetland" if "zetland" in art['url'] else "Politiken"
	status = "En flink abonnent på " + medium + " har delt en betalingsartikel. God fornøjelse! " + art['url']
	r = api.request('statuses/update', {'status': status})
	time.sleep(5)

Sådan trækker jeg links til gratis Zetland-artikler ud fra Zetlands Twitter-konto til wallnot.dk

På wallnot.dk udgiver jeg en liste over gratisartikler fra en lang række medier, der benytter sig af betalingsmure/paywalls. Siden er ment som en service til brugere, der ved, at de gerne vil læse nyhedsartikler, og at de ikke vil betale for dem.

Zetland er ikke som de andre aviser. Der er ikke en forside med links til alle nypublicerede artikler. I stedet bruger Zetland Twitter til at lægge appetitvækkere ud.

Jeg syntes det var ærgerligt ikke at have Zetland med på Wallnot, så i stedet for at kigge efter links på forsiden, som Wallnot gør hos de andre medier, brugte jeg Twitters API til at trække artikellinks ud.

Her kan du se, hvordan jeg gjorde. Hvis du gerne vil prøve programmet af, skal du registrere dig som udvikler på Twitter.

# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
""" This program uses the Twitter API to get a list of free articles from Zetland """

import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import date
import json
from nested_lookup import nested_lookup
import base64


# GETS TWITTER DATA #

# Key and secret from Twitter developer account: https://developer.twitter.com/en/apply/user
client_key = ''
client_secret = ''

# Application-only auth flow: base64-encode "key:secret" and exchange it
# for a bearer token at the oauth2/token endpoint.
key_secret = '{}:{}'.format(client_key, client_secret).encode('ascii')
b64_encoded_key = base64.b64encode(key_secret).decode('ascii')

base_url = 'https://api.twitter.com/'
auth_url = '{}oauth2/token'.format(base_url)

auth_headers = {
	'Authorization': 'Basic {}'.format(b64_encoded_key),
	'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
}

auth_data = {
	'grant_type': 'client_credentials'
}

auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data)
# Fail fast with a clear HTTP error if authentication was rejected,
# instead of a confusing KeyError on 'access_token' below.
# (Also removed the original's dead no-op statement auth_resp.json().keys().)
auth_resp.raise_for_status()
access_token = auth_resp.json()['access_token']

# Bearer-token header used for all subsequent API requests
search_headers = {
	'Authorization': 'Bearer {}'.format(access_token)
}

# Parameters for Zetland's user timeline (user_id 452898921): the newest
# 35 tweets with full text, excluding replies, with minimal user objects.
search_params = {
	'user_id': 452898921,
	'count': 35,
	'tweet_mode': 'extended',
	'exclude_replies': 'true',
	'trim_user': 'true'
}

# Request url for searching user timelines
search_url = '{}1.1/statuses/user_timeline.json'.format(base_url)

# Request to Twitter and decode the json response
search_resp = requests.get(search_url, headers=search_headers, params=search_params)
tweet_data = search_resp.json()
#prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging to pretify json

# Every expanded_url anywhere in the json is a candidate article link
linklist = list(set(nested_lookup('expanded_url', tweet_data)))

# Keep only links to Zetland stories
urllist = [link for link in linklist if "zetland.dk/historie" in link]

		
# GETS ARTICLE DATA FROM ZETLAND #

# Request each linked article and collect title, url and publication
# date directly from the Zetland site.
articlelist = []
titlecheck = []

for url in urllist:
	try:
		data = requests.get(url)
		soup = BeautifulSoup(data.text, "lxml")

		# Title and publication timestamp from the og/article meta tags
		title = soup.find('meta', attrs={'property':'og:title'})['content']
		timestamp = soup.find('meta', attrs={'property':'article:published_time'})['content']
		# Zetland timestamps carry a "+HH:MM" utc offset; cut it off so the
		# offset-less strptime format below matches.
		timestamp = timestamp[:timestamp.find("+")]
		dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')

		# Deduplicate on title: the same story may be linked several times
		if title not in titlecheck:
			articlelist.append({"title": title, "url": url, "date": dateofarticle})
			titlecheck.append(title)
	# Was a bare "except:"; narrowed so Ctrl-C still works, and the reason
	# for the failure is logged instead of silently discarded. Also dropped
	# the unused "articledict" local from the original.
	except Exception as e:
		print(url, e)


# PREPARES LIST OF ARTICLES FOR WALLNOT.DK #

# Newest articles first
articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True)

# Drop articles older than approximately three months (92 days)
now = datetime.now()
articlelist_recent = [a for a in articlelist_sorted if (now - a["date"]).days < 92]

# Render each article as an html paragraph: "dd/mm: <linked title>"
zet_linkstr = ""
for article in articlelist_recent:
	friendlydate = article["date"].strftime("%d/%m")
	zet_linkstr += '<p>' + friendlydate + ': ' + '<a href="' + article["url"] + '">' + article["title"] + '</a></p>\n'

# Prints list of articles
print(zet_linkstr)