It's a 3 MB JSON file! I wrote a small Python program to chew through it:
import json

# Load the consent manager's vendor list
with open("privacy-manager-view.json", "r", encoding="utf8") as politiken_file:
    politiken = json.load(politiken_file)

# Collect each vendor's name, privacy policy URL and declared purposes
partners = []
for vendor in politiken['vendors']:
    name = vendor['name']
    url = vendor.get('policyUrl')  # not every vendor lists a policy URL
    purposes = []
    if 'consentCategories' in vendor:
        for consent in vendor['consentCategories']:
            if consent['type'] == "IAB_PURPOSE":
                purposes.append(consent['name'])
    if 'iabSpecialPurposes' in vendor:
        for purpose in vendor['iabSpecialPurposes']:
            purposes.append(purpose)
    if 'iabFeatures' in vendor:
        for purpose in vendor['iabFeatures']:
            purposes.append(purpose)
    if 'iabSpecialFeatures' in vendor:
        for purpose in vendor['iabSpecialFeatures']:
            purposes.append(purpose)
    partners.append([name, url, purposes])

# Sort partners alphabetically, ignoring case
partners.sort(key=lambda x: x[0].lower())
number_of_partners = len(partners)

# Build an HTML page with one link per partner
linklist = "<html lang='da'><body><h1>"
linklist += "Her er de " + str(number_of_partners) + " virksomheder, som overvåger dig, hvis du siger ja tak til alle cookies på politiken.dk (d. 11. december 2020)</h1><table>"
for partner in partners:
    if partner[1]:
        linklist += "<tr><td><a href='" + partner[1] + "'>" + partner[0] + "</a></td></tr>\n"
    else:
        linklist += "<tr><td>" + partner[0] + "</td></tr>\n"
linklist += "</table></body></html>"

with open("linklist.html", "wt", encoding="utf8") as fout:
    fout.write(linklist)
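For reference, here is roughly what the script assumes a single entry in politiken['vendors'] looks like. This is a hypothetical reconstruction from the field names the script reads, not an excerpt from the actual file (the purpose names are standard IAB TCF labels):

# Hypothetical vendor entry, reconstructed from the fields read above
vendor = {
    "name": "Example Ad Tech Inc.",
    "policyUrl": "https://example.com/privacy",
    "consentCategories": [
        {"type": "IAB_PURPOSE", "name": "Store and/or access information on a device"}
    ],
    "iabSpecialPurposes": ["Ensure security, prevent fraud, and debug"],
    "iabFeatures": ["Match and combine offline data sources"],
    "iabSpecialFeatures": ["Use precise geolocation data"],
}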
Update: I've made a new, improved version of the bot. Read about it here.
Both Zetland and Politiken have a feature that lets subscribers share paywalled articles with friends, acquaintances and the public at large. The article gets a unique URL that unlocks the paywall and lets anyone read it.
I figured that Wallnot – my website with articles that aren't behind a paywall – could use a stronger presence on social media.
So I built a bot that searches Twitter for shared articles from Zetland and Politiken – and posts the links as tweets. The bot runs at roughly 8:25, 12:25, 16:25 and 20:25. It would be lovely to post more often during the day, but then Twitter wants money.
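The script itself contains no scheduler; one way to hit those four times is a cron job. A minimal crontab sketch (the path to the script is hypothetical):

# Run the bot at 8:25, 12:25, 16:25 and 20:25 every day
25 8,12,16,20 * * * python3 /home/user/twitterbot.py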
# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
# THIS PROGRAM POSTS NEW SHARED ARTICLES FROM ZETLAND.DK AND POLITIKEN.DK TO TWITTER
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import time
from nested_lookup import nested_lookup
from TwitterAPI import TwitterAPI
articlestopost = []
# API LOGIN - INSERT YOUR OWN VALUES HERE
client_key = ''
client_secret = ''
access_token = ''
access_secret = ''
api = TwitterAPI(client_key, client_secret, access_token, access_secret)
# POLITIKEN.DK SEARCH #
SEARCH_TERM = 'url:"politiken.dk/del/"'
PRODUCT = '30day'
LABEL = 'prod'
r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL),
                {'query': SEARCH_TERM})
tweet_data = json.loads(r.text)
prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging, to prettify the json
# Look for all instances of expanded_url in the json
linklist = list(set(nested_lookup('expanded_url', tweet_data)))
urllist = []
for link in linklist:
    if "politiken.dk/del" in link:
        urllist.append(link)
# Request the articles, get titles and dates, and sort by date
articlelist = []
titlecheck = []
for url in urllist:
    try:
        data = requests.get(url)
        result = data.text
        # Paywalled articles are marked as not being accessible for free
        if '"isAccessibleForFree": "True"' not in result:
            soup = BeautifulSoup(result, "lxml")
            # Find title and timestamp
            title = soup.find('meta', attrs={'property': 'og:title'})
            title = title['content']
            timestamp = soup.find('meta', attrs={'property': 'article:published_time'})
            timestamp = timestamp['content']
            dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z')
            # The share link redirects, so the real article URL is in the first redirect's Location header
            realurl = data.history[0].headers['Location']
            if title not in titlecheck:
                articlelist.append({"title": title, "url": realurl, "date": dateofarticle})
                titlecheck.append(title)
    except Exception as e:
        print(url, e)
articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True)
# Check whether each article has already been posted and update the list of posted articles
with open("./pol_published.json", "r", encoding="utf8") as fin:
    alreadypublished = list(json.load(fin))
for art in articlelist_sorted:
    url = art['url']
    # Strip the shareToken so the same article shared by several people only counts once
    token = url.index("?shareToken")
    url = url[:token]
    if url not in alreadypublished:
        alreadypublished.append(url)
        articlestopost.append(art)
# Save the updated list of already published links
with open("./pol_published.json", "wt", encoding="utf8") as fout:
    alreadypublishedjson = json.dumps(alreadypublished)
    fout.write(alreadypublishedjson)
# ZETLAND.DK SEARCH #
SEARCH_TERM = 'url:"zetland.dk/historie"'
PRODUCT = '30day'
LABEL = 'prod'
r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL),
                {'query': SEARCH_TERM})
tweet_data = json.loads(r.text)
prettyjson = json.dumps(tweet_data, ensure_ascii=False, indent=4) # Only needed for debugging, to prettify the json
# Look for all instances of expanded_url in the json
linklist = list(set(nested_lookup('expanded_url', tweet_data)))
urllist = []
for link in linklist:
    if "zetland.dk/historie" in link:
        urllist.append(link)
# Request the articles, get titles and dates, and sort by date
articlelist = []
titlecheck = []
for url in urllist:
    try:
        data = requests.get(url)
        result = data.text
        # Soup the page and extract title and timestamp
        soup = BeautifulSoup(result, "lxml")
        title = soup.find('meta', attrs={'property': 'og:title'})
        title = title['content']
        timestamp = soup.find('meta', attrs={'property': 'article:published_time'})
        timestamp = timestamp['content']
        # Zetland timestamps carry a timezone offset; strip it before parsing
        timestamp = timestamp[:timestamp.find("+")]
        dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')
        if title not in titlecheck:
            articlelist.append({"title": title, "url": url, "date": dateofarticle})
            titlecheck.append(title)
    except Exception as e:
        print(url, e)
articlelist_sorted = sorted(articlelist, key=lambda k: k['date'], reverse=True)
# Check whether each article has already been posted and update the list of posted articles
with open("./zet_published.json", "r", encoding="utf8") as fin:
    alreadypublished = list(json.load(fin))
for art in articlelist_sorted:
    title = art['title']
    if title not in alreadypublished:
        alreadypublished.append(title)
        articlestopost.append(art)
# Save the updated list of already published titles
with open("./zet_published.json", "wt", encoding="utf8") as fout:
    alreadypublishedjson = json.dumps(alreadypublished, ensure_ascii=False)
    fout.write(alreadypublishedjson)
# POST TO TWITTER #
if articlestopost:
    for art in articlestopost:
        if "zetland" in art['url']:
            medium = "Zetland"
        else:
            medium = "Politiken"
        status = "En flink abonnent på " + medium + " har delt en betalingsartikel. God fornøjelse! " + art['url']
        r = api.request('statuses/update', {'status': status})
        time.sleep(5)
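Two small notes. First, the script dedupes the two sites differently: for Politiken it strips the shareToken from the redirect target and dedupes on the canonical article URL, so the same article shared by two different subscribers is only tweeted once; for Zetland it dedupes on the article title, presumably for the same reason, since every share link is unique. Second, the script assumes pol_published.json and zet_published.json already exist, so before the first run they need to be seeded with empty lists. A one-time bootstrap sketch:

import json

# Seed the "already published" files so the first run doesn't crash on json.load
for path in ("./pol_published.json", "./zet_published.json"):
    with open(path, "wt", encoding="utf8") as f:
        json.dump([], f)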