Categories
blandet

Wallnots Twitter-bot: Version 2

Det er ikke mange dage siden, at Wallnot.dk‘s Twitter-bot gik i luften. Du kan finde botten her og mit indlæg om den her.

Robotten virkede sådan set fint nok, men pga. en begrænsning i Twitter’s API på 250 forespørgsler per måned, kunne jeg kun opdatere 4 gange i døgnet, og det er jo ret sjældent (det gamle program lavede 2 forespørgsler, hver gang det blev kørt, dvs. 30 dage * 4 opdateringer * 2 forespørgsler = 240 forespørgsler).

Heldigvis fandt jeg TWINT, et Python-modul der gør det nemt at hente data fra Twitter uden at gøre brug af Twitter’s API med dets kedelige begrænsninger.

Med genbrug af det meste af min gamle kode, har jeg nu lavet en version af robotten, der kan køre lige så tit, jeg har lyst til. Jeg har foreløbig sat den til at køre 4 gange i timen.

For sjov skyld har jeg også tilføjet en række venlige adjektiver om abonnenterne på Politiken og Zetland, som programmet vælger tilfældigt mellem, hver gang det lægger et link på Twitter.

Den færdige kode

Her er den færdige kode, hvis du er interesseret.

# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com

import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import date
from datetime import timedelta
import json
import time
import random
import twint	# https://github.com/twintproject/twint
from TwitterAPI import TwitterAPI

# CONFIGURATION #
# List to store articles to post to Twitter
articlestopost = []

# Yesterday's date variable
yesterday = date.today() - timedelta(days=1)
since = yesterday.strftime("%Y-%m-%d")

# Twint configuration
c = twint.Config()
c.Hide_output = True
c.Store_object = True
c.Since = since

# API LOGIN
client_key = ''
client_secret = ''
access_token = ''
access_secret = ''
api = TwitterAPI(client_key, client_secret, access_token, access_secret)


# POLITIKEN #
# Run search
searchterm = "politiken.dk/del"
c.Search = searchterm
twint.run.Search(c)
tweets = twint.output.tweets_object

# Add urls in tweets to list and remove any duplicates from list
urllist = []
for tweet in tweets:
	for url in tweet.urls:
		if searchterm in url:
			urllist.append(url)

urllist = list(set(urllist))

# Only proces urls that were not in our last Twitter query
proceslist = []
with open("./pol_lastbatch.json", "r", encoding="utf8") as fin:
	lastbatch = list(json.load(fin))
	for url in urllist:
		if url not in lastbatch:
			proceslist.append(url)
# Save current query to use for next time
with open("./pol_lastbatch.json", "wt", encoding="utf8") as fout:
	lastbatch = json.dumps(urllist)
	fout.write(lastbatch)

# Request articles and get titles and dates and sort by dates
articlelist = []
titlecheck = []

for url in proceslist:
	try:
		data = requests.get(url)
		result = data.text
		if '"isAccessibleForFree": "True"' not in result:
			soup = BeautifulSoup(result, "lxml")
			# Finds titles and timestamps
			title = soup.find('meta', attrs={'property':'og:title'})
			title = title['content']
			timestamp = soup.find('meta', attrs={'property':'article:published_time'})
			timestamp = timestamp['content']
			dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z')
			realurl = data.history[0].headers['Location']
			if title not in titlecheck:
				articlelist.append({"title": title, "url": realurl, "date": dateofarticle})
				titlecheck.append(title)			
	except Exception as e:
		print(url)
		print(e)
			
articlelist_sorted = sorted(articlelist, key=lambda k: k['date']) 

# Check if article is already posted and update list of posted articles
with open("./pol_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))
	# File below used for paywall.py to update wallnot.dk
	with open("./pol_full_share_links.json", "r", encoding="utf8") as finalready:	
		alreadypublishedalready = list(json.load(finalready))
		for art in articlelist_sorted:
			url = art['url']
			token = url.index("?shareToken")
			url = url[:token]
			if url not in alreadypublished:
				alreadypublished.append(url)
				articlestopost.append(art)
				alreadypublishedalready.append(art['url'])
		# Save updated already published links
		with open("./pol_published.json", "wt", encoding="utf8") as fout:
			alreadypublishedjson = json.dumps(alreadypublished)
			fout.write(alreadypublishedjson)
		with open("./pol_full_share_links.json", "wt", encoding="utf8") as fout:
			alreadypublishedjson = json.dumps(alreadypublishedalready)
			fout.write(alreadypublishedjson)


# ZETLAND #
# Run search
searchterm = "zetland.dk/historie"
c.Search = searchterm
twint.run.Search(c)
tweets = twint.output.tweets_object

# Add urls in tweets to list and remove any duplicates from list
urllist = []
for tweet in tweets:
	for url in tweet.urls:
		if searchterm in url:
			urllist.append(url)

urllist = list(set(urllist))

# Only proces urls that were not in our last Twitter query
proceslist = []
with open("./zet_lastbatch.json", "r", encoding="utf8") as fin:
	lastbatch = list(json.load(fin))
	for url in urllist:
		if url not in lastbatch:
			proceslist.append(url)
# Save current query to use for next time
with open("./zet_lastbatch.json", "wt", encoding="utf8") as fout:
	lastbatch = json.dumps(urllist)
	fout.write(lastbatch)

# Request articles and get titles and dates and sort by dates
articlelist = []
titlecheck = []

for url in proceslist:
	try:
		data = requests.get(url)
		result = data.text
		soup = BeautifulSoup(result, "lxml")
		title = soup.find('meta', attrs={'property':'og:title'})
		title = title['content']
		timestamp = soup.find('meta', attrs={'property':'article:published_time'})
		timestamp = timestamp['content']
		timestamp = timestamp[:timestamp.find("+")]
		dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')
		if title not in titlecheck:
			articlelist.append({"title": title, "url": url, "date": dateofarticle})
			titlecheck.append(title)
	except Exception as e:
		print(url)
		print(e)
			
articlelist_sorted = sorted(articlelist, key=lambda k: k['date']) 

# Check if article is already posted and update list of posted articles
with open("./zet_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))
	for art in articlelist_sorted:
		title = art['title']
		if title not in alreadypublished:
			alreadypublished.append(title)
			articlestopost.append(art)
	# Save updated already published links
	with open("./zet_published.json", "wt", encoding="utf8") as fout:
		alreadypublishedjson = json.dumps(alreadypublished, ensure_ascii=False)
		fout.write(alreadypublishedjson)


# POST TO TWITTER #
friendlyterms = ["flink","rar","gavmild","velinformeret","intelligent","sød","afholdt","bedårende","betagende","folkekær","godhjertet","henrivende","smagfuld","tækkelig","hjertensgod","graciøs","galant","tiltalende","prægtig","kær","godartet","human","indtagende","fortryllende","nydelig","venlig","udsøgt","klog","kompetent","dygtig","ejegod","afholdt","omsorgsfuld","elskværdig","prægtig","skattet","feteret"]
enjoyterms = ["God fornøjelse!", "Nyd den!", "Enjoy!", "God læsning!", "Interessant!", "Spændende!", "Vidunderligt!", "Fantastisk!", "Velsignet!", "Glæd dig!", "Læs den!", "Godt arbejde!", "Wauv!"]

if articlestopost:
	for art in articlestopost:
		if "zetland" in art['url']:
			medium = "Zetland"
		else:
			medium = "Politiken"
		friendlyterm = random.choice(friendlyterms)
		enjoyterm = random.choice(enjoyterms)
		status = "En " + friendlyterm + " abonnent på " + medium + " har delt en artikel. " + enjoyterm + " " + art['url']
		r = api.request('statuses/update', {'status': status})
		time.sleep(15)
Categories
blandet

Wallnots nye Twitter-robot

Opdatering: Jeg har lavet en ny, forbedret udgave af robottten. Læs om den her.

Både Zetland og Politiken har en feature, hvor abonnenter kan dele betalingsartikler med venner, bekendte og offentligheden. Artiklen får en unik URL, som låser op for betalingsmuren og lader alle og enhver læse artiklen.

Jeg tænkte at Wallnot – min hjemmeside med artikler, der ikke er bag betalingsmur – trængte til at være mere til stede på sociale medier.

Derfor har jeg lavet en robot, der gennemsøger Twitter for delte artikler fra Zetland og Politiken – og deler links’ne som tweets. Robotten opdater ca. på klokkeslettene 8.25, 12.25, 16.25 og 20.25. Det ville være skønt at kunne opdatere flere gange i døgnet, men så vil Twitter have penge.

Du finder Wallnots nye Twitter-robot her: https://twitter.com/wallnot_dk

Sådan ser det ud, når Wallnots robot tweeter.

Jeg brugte Python og modulet TwitterAPI. Hvis du selv vil køre programmet, skal du lave en udviklerkonto og en app hos Twitter. Se
https://developer.twitter.com/.

Her er det færdige program.

# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
# THIS PROGRAM POSTS NEW SHARED ARTICLES FROM ZETLAND.DK AND POLITIKEN.DK TO TWITTER

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import time
from nested_lookup import nested_lookup
from TwitterAPI import TwitterAPI

articlestopost = []

# API LOGIN - INSERT YOUR OWN VALUES HERE
client_key = ''
client_secret = ''
access_token = ''
access_secret = ''
api = TwitterAPI(client_key, client_secret, access_token, access_secret)


# POLITIKEN.DK SEARCH #
SEARCH_TERM = 'url:"politiken.dk/del/"'
PRODUCT = '30day'
LABEL = 'prod'

r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), 
                {'query':SEARCH_TERM})

tweet_data = json.loads(r.text)
prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging to pretify json

# Looks for all instances of expanded_url in json	
linklist = list(set(nested_lookup('expanded_url', tweet_data)))

urllist = []
for link in linklist:
	if "politiken.dk/del" in link:
		urllist.append(link)

# Request articles and get titles and dates and sort by dates
articlelist = []
titlecheck = []

for url in urllist:
	try:
		data = requests.get(url)
		result = data.text
		if '"isAccessibleForFree": "True"' not in result:
			soup = BeautifulSoup(result, "lxml")
			# Finds titles and timestamps
			title = soup.find('meta', attrs={'property':'og:title'})
			title = title['content']
			timestamp = soup.find('meta', attrs={'property':'article:published_time'})
			timestamp = timestamp['content']
			dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z')
			realurl = data.history[0].headers['Location']
			if title not in titlecheck:
				articlelist.append({"title": title, "url": realurl, "date": dateofarticle})
				titlecheck.append(title)			
	except:
		print(url)
			
articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True) 

# Check if article is already posted and update list of posted articles
with open("./pol_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))
	for art in articlelist_sorted:
		url = art['url']
		token = url.index("?shareToken")
		url = url[:token]
		if url not in alreadypublished:
			alreadypublished.append(url)
			articlestopost.append(art)
	# Save updated already published links
	with open("./pol_published.json", "wt", encoding="utf8") as fout:
		alreadypublishedjson = json.dumps(alreadypublished)
		fout.write(alreadypublishedjson)

# ZETLAND.DK SEARCH #
SEARCH_TERM = 'url:"zetland.dk/historie"'
PRODUCT = '30day'
LABEL = 'prod'

r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), 
                {'query':SEARCH_TERM})

tweet_data = json.loads(r.text)
prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging to pretify json

# Looks for all instances of expanded_url in json	
linklist = list(set(nested_lookup('expanded_url', tweet_data)))

urllist = []
for link in linklist:
	if "zetland.dk/historie" in link:
		urllist.append(link)

# Request articles and get titles and dates and sort by dates
articlelist = []
titlecheck = []

for url in urllist:
	try:
		data = requests.get(url)
		result = data.text

		# Soup site and create a dictionary of links and their titles and dates
		articledict = {}
		soup = BeautifulSoup(result, "lxml")

		title = soup.find('meta', attrs={'property':'og:title'})
		title = title['content']
		
		timestamp = soup.find('meta', attrs={'property':'article:published_time'})
		timestamp = timestamp['content']
		timestamp = timestamp[:timestamp.find("+")]
		dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')
		
		if title not in titlecheck:
			articlelist.append({"title": title, "url": url, "date": dateofarticle})
			titlecheck.append(title)
	except:
		print(url)
			
articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True) 

# Check if article is already posted and update list of posted articles
with open("./zet_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))
	for art in articlelist_sorted:
		title = art['title']
		if title not in alreadypublished:
			alreadypublished.append(title)
			articlestopost.append(art)
	# Save updated already published links
	with open("./zet_published.json", "wt", encoding="utf8") as fout:
		alreadypublishedjson = json.dumps(alreadypublished, ensure_ascii=False)
		fout.write(alreadypublishedjson)


# POST TO TWITTER #
if articlestopost:
	for art in articlestopost:
		if "zetland" in art['url']:
			medium = "Zetland"
		else:
			medium = "Politiken"
		status = "En flink abonnent på " + medium + " har delt en betalingsartikel. God fornøjelse! " + art['url']
		r = api.request('statuses/update', {'status': status})
		time.sleep(5)