Wallnot's Twitter bot, version 3

Wallnot's Twitter bot finds shared articles from Politiken and Zetland on Twitter and shares them with the world. It works like this:

# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com

import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import date
from datetime import timedelta
import json
import time
import random
from TwitterAPI import TwitterAPI
from nested_lookup import nested_lookup

# CONFIGURATION #
# List to store articles to post to Twitter
articlestopost = []

# Search tweets from the last 3 hours
now = datetime.utcnow()
since_hours = 3
since = now - timedelta(hours=since_hours)
since_string = since.strftime("%Y-%m-%dT%H:%M:%SZ")

# Search configuration
# https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent
# https://github.com/twitterdev/Twitter-API-v2-sample-code/tree/master/Recent-Search
tweet_fields = "tweet.fields=entities"
media_fields = "media.fields=url"
max_results = "max_results=100"
start_time = "start_time=" + since_string

# Twitter API login
client_key = ''
client_secret = ''
access_token = ''
access_secret = ''
api = TwitterAPI(client_key, client_secret, access_token, access_secret)

bearer_token = ''
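# The bearer token authorizes the v2 recent search requests; the four keys above are used to post tweets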

# POLITIKEN #
# Run search
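# Search for recent tweets linking to Politiken share URLs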
query = 'politiken.dk/del'

url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
	query, tweet_fields, media_fields, max_results, start_time
)
headers = {"Authorization": "Bearer {}".format(bearer_token)}
response = requests.request("GET", url, headers=headers)
json_response = response.json()

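# Collect every expanded URL in the response and remove duplicates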
urllist = list(set(nested_lookup('expanded_url', json_response)))

# Only process URLs that were not in our last Twitter query
proceslist = []
with open("./pol_lastbatch.json", "r", encoding="utf8") as fin:
	lastbatch = list(json.load(fin))
	for url in urllist:
		if url not in lastbatch and query in url:
			proceslist.append(url)
# Save current query to use for next time
with open("./pol_lastbatch.json", "wt", encoding="utf8") as fout:
	lastbatch = json.dumps(urllist)
	fout.write(lastbatch)

# Request articles and extract a unique article ID from each share link
articlelist = []

pol_therewasanerror = False
for url in proceslist:
	try:
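		# Some links are wrapped in a Google redirect; extract the target from the url= parameter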
		if 'https://www.google.com' in url:
			start = url.find('url=')+4
			end = url.find('&', start)
			url = url[start:end]	
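		# Politiken share links are 37 characters long; cut off anything appended after that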
		if len(url) != 37:
			url = url[:37]
		data = requests.get(url)
		result = data.text
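		# Articles marked as freely accessible are skipped; only paywalled articles are posted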
		if '"isAccessibleForFree": "True"' not in result:
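			# The share link redirects to the real article; extract a unique article ID from its path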
			realurl = data.history[0].headers['Location']
			if "/article" not in realurl and ".ece" not in realurl:
				start_of_unique_id = realurl.index("/art")+1
				end_of_unique_id = realurl[start_of_unique_id:].index("/")
				unique_id = realurl[start_of_unique_id:start_of_unique_id+end_of_unique_id]
			elif "/article" in realurl and ".ece" in realurl:
				start_of_unique_id = realurl.index("/article")+1
				end_of_unique_id = realurl[start_of_unique_id:].index(".ece")
				unique_id = realurl[start_of_unique_id:start_of_unique_id+end_of_unique_id]
			articlelist.append({"id": unique_id, "url": url})
	except Exception as e:
		print(url)
		print(e)
		pol_therewasanerror = True

# If something fails, we'll process everything again next time
if pol_therewasanerror:
	with open("./pol_lastbatch.json", "wt", encoding="utf8") as fout:
		urllist = []
		lastbatch = json.dumps(urllist)
		fout.write(lastbatch)
	
# Check if article is already posted and update list of posted articles
with open("./pol_published_v2.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))
	# The file below is used by paywall.py to update wallnot.dk
	for article in articlelist:
		hasbeenpublished = False
		for published_article in alreadypublished:
			if article['id'] == published_article['id']:
				hasbeenpublished = True
				break
		if not hasbeenpublished:
			alreadypublished.append(article)
			articlestopost.append(article)
	# Save updated already published links
	with open("./pol_published_v2.json", "wt", encoding="utf8") as fout:
		alreadypublishedjson = json.dumps(alreadypublished)
		fout.write(alreadypublishedjson)

# ZETLAND #
# Run search
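# Search for recent tweets linking to Zetland articles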
query = 'zetland.dk/historie'

url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
	query, tweet_fields, media_fields, max_results, start_time
)
headers = {"Authorization": "Bearer {}".format(bearer_token)}
response = requests.request("GET", url, headers=headers)
json_response = response.json()

urllist = list(set(nested_lookup('expanded_url', json_response)))

# Only process URLs that were not in our last Twitter query
proceslist = []
with open("./zet_lastbatch.json", "r", encoding="utf8") as fin:
	lastbatch = list(json.load(fin))
	for url in urllist:
		if url not in lastbatch and query in url:
			proceslist.append(url)
# Save current query to use for next time
with open("./zet_lastbatch.json", "wt", encoding="utf8") as fout:
	lastbatch = json.dumps(urllist)
	fout.write(lastbatch)

# Request articles and get their titles and publication dates (sorted by date further down)
articlelist = []
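# Titles already queued in this batch, used to avoid duplicates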
titlecheck = []

zet_therewasanerror = False
for url in proceslist:
	try:
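		# Unwrap Google redirect links as above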
		if 'https://www.google.com' in url:
			start = url.find('url=')+4
			end = url.find('&', start)
			url = url[start:end]		
		data = requests.get(url)
		result = data.text
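		# Read the title and publication time from the og:title and article:published_time meta tags, and strip the timezone offset before parsing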
		soup = BeautifulSoup(result, "lxml")
		title = soup.find('meta', attrs={'property':'og:title'})
		title = title['content']
		timestamp = soup.find('meta', attrs={'property':'article:published_time'})
		timestamp = timestamp['content']
		timestamp = timestamp[:timestamp.find("+")]
		dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')
		if title not in titlecheck:
			articlelist.append({"title": title, "url": url, "date": dateofarticle})
			titlecheck.append(title)
	except Exception as e:
		print(url)
		print(e)
		zet_therewasanerror = True

# If something fails, we'll process everything again next time
if zet_therewasanerror:
	with open("./zet_lastbatch.json", "wt", encoding="utf8") as fout:
		urllist = []
		lastbatch = json.dumps(urllist)
		fout.write(lastbatch)

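# Sort articles by publication date so the oldest is posted first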
articlelist_sorted = sorted(articlelist, key=lambda k: k['date']) 

# Check if article is already posted and update list of posted articles
with open("./zet_published.json", "r", encoding="utf8") as fin:
	alreadypublished = list(json.load(fin))
	for art in articlelist_sorted:
		title = art['title']
		if title not in alreadypublished:
			alreadypublished.append(title)
			articlestopost.append(art)
	# Save updated already published links
	with open("./zet_published.json", "wt", encoding="utf8") as fout:
		alreadypublishedjson = json.dumps(alreadypublished, ensure_ascii=False)
		fout.write(alreadypublishedjson)


# POST TO TWITTER #
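# Danish adjectives and exclamations used to vary the wording of each tweet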
friendlyterms = ["flink","rar","gavmild","velinformeret","intelligent","sød","afholdt","bedårende","betagende","folkekær","godhjertet","henrivende","smagfuld","tækkelig","hjertensgod","graciøs","galant","tiltalende","prægtig","kær","godartet","human","indtagende","fortryllende","nydelig","venlig","udsøgt","klog","kompetent","dygtig","ejegod","afholdt","omsorgsfuld","elskværdig","prægtig","skattet","feteret"]
enjoyterms = ["God fornøjelse!", "Nyd den!", "Enjoy!", "God læsning!", "Interessant!", "Spændende!", "Vidunderligt!", "Fantastisk!", "Velsignet!", "Glæd dig!", "Læs den!", "Godt arbejde!", "Wauv!"]

if articlestopost:
	for art in articlestopost:
		if "zetland" in art['url']:
			medium = "@ZetlandMagasin"
		else:
			medium = "@politiken"
		friendlyterm = random.choice(friendlyterms)
		enjoyterm = random.choice(enjoyterms)
		status = "En " + friendlyterm + " abonnent på " + medium + " har delt en artikel. " + enjoyterm
		twitterstatus = status + " " + art['url']
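		# Post the tweet and wait 15 seconds before the next one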
		try:
			twitterupdate = api.request('statuses/update', {'status': twitterstatus})
		except Exception as e:
			print(e)
		time.sleep(15)