Jeg er ved at bygge min egen aula-klient:
Tag: django
Til kamp mod phishing på lnk.dk: del 2
Mit første naive forsøg på at forhindre kriminelles brug af lnk.dk til at pege på diverse phishing-sider lykkedes, mildt sagt, ikke.
Nu har jeg taget skrappere midler i brug.
I min models.py i min Django-applikation, tilføjer jeg et felt til at gemme ip-adresse på den, der har oprettet et link, og et felt til at markere, om et link er usikkert:
class Link(models.Model):
# Short link is only lower case
def save(self, *args, **kwargs):
self.shortlink = self.shortlink.lower()
return super(Link, self).save(*args, **kwargs)
destination = models.URLField('Destinationslink', max_length=65535, validators=[validate_destination])
shortlink = models.SlugField('Kort link', max_length=100, unique=True, allow_unicode=False, validators=[validate_shortlink])
LINK_TYPE_CHOICES = (
('automatic', 'Automatisk'),
('manual', 'Manuelt'),
)
submitter_ip = models.GenericIPAddressField(null=True)
unsafe_link = models.BooleanField(default=False)
type = models.CharField('Type', max_length=10, choices=LINK_TYPE_CHOICES)
date = models.DateTimeField(default=timezone.now, editable=False)
Derudover tilføjer jeg tabeller til at kunne blokere for ip-adresser og domæner, der ikke skal kunne oprette brugbare links:
class Ban(models.Model):
banned_ip = models.GenericIPAddressField(unique=True)
class BanDomain(models.Model):
banned_domain = models.CharField(max_length=255, unique=True)
Med det på plads tilpasser jeg min logik til oprettelse af links i views.py sådan, at:
- kun brugere med ip-adresser kan oprette links,
- links bliver tjekket med Google Safe Browsing (efter råd på Twitter)
- links til domæner og fra ip-adresser, der er blokeret, bliver automatisk markeret som usikre
Her er funktionen til at tjekke links op mod Google Safe Browsing:
# Google safe browsing API check function
def is_url_google_safe_browsing_safe(url):
params = {
'key': ''
}
json = {
"client": {
"clientId": "lnk.dk",
"clientVersion": "1.0"
},
"threatInfo": {
"threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE"],
"platformTypes": ["ANY_PLATFORM"],
"threatEntryTypes": ["URL"],
"threatEntries": [
{"url": url}
]
}
}
api_url = 'https://safebrowsing.googleapis.com/v4/threatMatches:find'
try:
response_json = requests.post(api_url, params=params, json=json).json()
if response_json:
return True
else:
return False
# If something unexpected is returned from Google, link creation is allowed
except:
return False
Og her er min nye logik til at tjekke modtagne links. Læg mærke til, at brugere uden ip-adresse automatisk bliver rickrolled
if form.is_valid():
client_ip, is_routable = get_client_ip(request)
# Hiding your IP seems illegit, so user is rickrolled
if client_ip is None:
return HttpResponseRedirect('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
else:
destination = form.cleaned_data['destination'] # Submitted destination
shortlink = form.cleaned_data['shortlink'] # Submitted slug
# Google Safe Browsing check
unsafe_url = is_url_google_safe_browsing_safe(destination)
# Ban domain check
domain_info = extract(destination)
domain = domain_info.domain + '.' + domain_info.suffix
domain_ban = BanDomain.objects.filter(banned_domain=domain)
if len(domain_ban) > 0:
banned_domain = True
else:
banned_domain = False
# Ban ip check
ip_ban = Ban.objects.filter(banned_ip=client_ip)
if len(ip_ban) > 0:
banned_ip = True
else:
banned_ip = False
Til sidst har jeg forsøgt at narre phisherne ved at links til usikre sider virker, for den, der selv har oprettet linket. Alle andre bliver rickrollet, hvis de klikker på et usikkert link:
# Short link redirect to destination URL
def redirect(request, shortlink):
# Query the database for short link, if there is a hit, redirect to destination URL
# In case of uppercase characters in user input shortlink, the link is made lowercase
# Also, check if ip is banned
try:
client_ip, is_routable = get_client_ip(request)
link = Link.objects.get(shortlink=shortlink.lower())
# Legit users are rickrolled when an unsafe link is visited from an IP that is not banned
if link.unsafe_link == True and not client_ip == link.submitter_ip:
return HttpResponseRedirect('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
# For safe links AND for unsafe links visited from banned IPs, user is redirected to destination
else:
# If there's a referer and it's the same as the destination link, show a 404 to avoid an endless loop
if 'HTTP_REFERER' in request.META and link.destination == request.META['HTTP_REFERER']:
return render(request, 'links/404.html', status=404)
return HttpResponseRedirect(link.destination)
# In case of an error, show 404 page
except:
return render(request, 'links/404.html', status=404)
Til kamp mod phishing på lnk.dk
Der er desværre nogle kriminelle, der har opdaget min kortlink-service lnk.dk og bruger siden til at lave korte links, der peger på forskellige phishing-formularer. De fleste på fransk, enkelte på dansk.
Jeg vil helst kun have, at min side bruges til lovlige formål, og derfor har jeg i første omgang lavet et kontrolspørgsmål i formularen til at oprette links. Jeg håber, at det kun er ærlige mennesker, der kan svare på spørgsmålet, og at det er relativt nemt for dem:

For at implementere det nye felt, redigerede jeg min Django-applikations forms.py med feltet og krav til validering:
from django.forms import ModelForm
from django import forms
from .models import Link
from django.core.exceptions import ValidationError
class LinkForm(ModelForm):
everyoneknows = forms.CharField(label='Hvad er fornavnet på cykelrytteren, der vandt Tour de France for mænd i 2022?', error_messages={'required': 'Indtast cykelrytterens fornavn'})
def clean_everyoneknows(self):
answer = self.cleaned_data['everyoneknows'].lower()
if answer != 'jonas':
raise ValidationError("Det fornavn, du har indtastet, er forkert.")
return answer
def __init__(self, *args, **kwargs):
super(LinkForm, self).__init__(*args, **kwargs)
self.fields['destination'].widget.attrs['placeholder'] = 'https://eksempel.dk/meget/lang/url'
self.fields['shortlink'].widget.attrs['placeholder'] = 'eksempel'
self.fields['shortlink'].label_suffix = "" # Remove colon after label
self.fields['shortlink'].required = False # Not required in form
def clean_shortlink(self):
shortlink = self.cleaned_data['shortlink']
return shortlink.lower()
class Meta:
model = Link
fields = ['destination', 'shortlink']
labels = {
'shortlink': ('Evt. selvvalgt kort link:'),
}
error_messages = {
'destination': {
'max_length': ('Din destinationsurl er for lang til denne kortlinkservice.'),
'invalid': ('Din destinationsurl er ikke en gyldig adresse. Husk http://, https:// eller ftp:// foran dit link, hvis du har glemt det.'),
},
'shortlink': {
'unique': ('Det selvvalgte link, du har valgt, er allerede i brug. Find på et andet.'),
'max_length': ('Dit selvvalgte link må maksimalt være 100 tegn langt.'),
'invalid': ('Du kan kun bruge bogstaver (dog ikke æ, ø, å - kun ASCII-tegnsættet), tal, bindestreg og understreg i din selvvalgte adresse.'),
}
}
Det bliver spændende at se, om ændringen har nogen effekt.
Folkets wallnot.dk
En bruger på wallnot.dk skrev til mig og foreslog at lade brugerne på siden vurdere kvaliteten af de artikler, siden linker til, ligesom på fx Hacker News. Idéen er at gode artikler så kan ligge øverst, mens metervaren synker ned i bunden – hvis altså folket har forstand på at vurdere den slags.
Jeg forsøgte at lave en sådan løsning, og den kan du nu prøve af.
Dynamik med JavaScript
For at gøre det helt smart og dynamisk, havde jeg brug for noget JavaScript, der kan fyre en stemme af sted, så snart en bruger klikker på ▲ eller ▼.
Jeg er ikke helt ferm til JavaScript, men jeg begynder at forstå det, og med god hjælp og lidt copy/paste fra forskellige kilder, landede jeg til sidst på noget kode, der ser ud til at virke.
Den første del henter en såkaldt CRSF-cookie, der sørger for, at man er nødt til at besøge Wallnot, inden man kan stemme på artikler, og at man ikke kan stemme på vegne af andre fra andre hjemmesider.
Den anden del sender en forespørgsel af sted med cookie-værdien og selve stemmen og opdaterer stemmeantallet på siden, når forespørgslen er behandlet.
function getCookie(name) {
let cookieValue = null;
if (document.cookie && document.cookie !== '') {
const cookies = document.cookie.split(';');
for (let i = 0; i < cookies.length; i++) {
const cookie = cookies[i].trim();
if (cookie.substring(0, name.length + 1) === (name + '=')) {
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
break;
}
}
}
return cookieValue;
}
const csrftoken = getCookie('csrftoken');
document.querySelectorAll('.vote').forEach(function(el){
el.addEventListener('click', function() {
article_votes_id = this.id.substring(0, this.id.indexOf('_')) + '_votes';
votes_to_replace = document.getElementById(article_votes_id)
fetch('/process_vote', {
method: "POST",
headers: {
"X-CSRFToken": csrftoken,
},
body: JSON.stringify({
vote: this.id
})
}).then(function (response) {
return response.json();
})
.then(function (data) {
votes_to_replace.innerHTML = data.votes;
})
.catch(function (err) {
console.log(err);
});
});
});
Behandling af forespørgslen
Forespørgslen sender et artikel-id af sted sammen med information om der er tale om en ▲-stemme eller en ▼-stemme.
I Djangos views.py skriver jeg en funktion, der kan modtage forespørgslen og returnerer stemmeantallet efter forespørgslen er behandlet. Funktionen sender JSON-data tilbage til mit JavaScript, hvis (og kun hvis) stemmen har et eksisterende artikel-id efterfulgt af enten “_up” eller “_down”. For alt andet svarer funktionen tilbage, at den er en tepotte og derfor ikke kan hjælpe:
def process_vote(request):
if request.method == "POST":
try:
vote = json.loads(request.body.decode())['vote']
article_id = vote[:vote.index('_')]
article = Article.objects.get(id=article_id)
if '_up' in vote:
article.votes += 1
elif '_down' in vote:
article.votes -= 1
else:
return HttpResponse(status=418)
article.save()
votes = {'votes': article.votes}
return JsonResponse(votes)
except:
return HttpResponse(status=418)
return HttpResponse(status=418)
En sorteringsalgoritme
Som det allersidste havde jeg brug for at udvikle en sorteringsalgoritme, der tog højde for artiklers alder, som jeg kunne bruge i mit view. Den tog lidt tid at skrive, fordi det nogle gange kan være svært at regne ud, hvordan man med Djangos databaseforespørgselssyntaks kan lave de beregninger, man har brug for, direkte med forespørgslen til databasen.
Algoritmen gør sådan her:
- Tager antal stemmer og lægger 1 til. Hvis alle artikler starter på 1, forhindrer jeg at artikler med et positivt antal stemmer altid vil ligge over artikler uden stemmer overhovedet.
- Deler dette tal med 1 plus antal timer siden artiklens offentliggørelsestidspunkt.
- Antal timer udregnes ved at tage antal dage siden offentliggørelsestidspunktet og gange med 24 og dertil lægge det yderligere antal timer fra det samlede interval i dage og timer siden offentliggørelsestidspunktet.
- For at undgå at komme til at dele med 0, lægger jeg 1 til antal timer og tager den absolutte værdi af antal timer siden offentliggørelsestidspunktet. Det er nødvendigt, fordi medierne engang imellem offentliggør artikler med et publiceringstidspunkt i fremtiden.
- Fordi jeg deler stemmer med antal timer siden offentliggørelse, vil en nyhed hurtigt miste sin “værdi”. Hvis Folkets Wallnot ikke bliver en kæmpe succes, kan det være at jeg skal dele med antal dage i stedet, sådan “straffen” for at være en gammel artikel ikke bliver ligeså mærkbar.
Her er algoritmen skrevet som forespørgsel i Django:
articles = Article.objects.filter(paywall_detected=False)
.annotate(score=ExpressionWrapper((F('votes') + 1) /
(1+Abs(ExtractDay(Now()-F('date'))*24 + ExtractHour(Now()-F('date')))),output_field=FloatField()))
.order_by('-score','-date')
Min egen private eReolen
Jeg kan godt lide at læse bøger, og mange af dem låner jeg på eReolen. Men for en bognørd er eReolen ikke særlig brugervenlig. Der er godt nok en masse mærkelige søgninger, man kan lave, hvis man er teknisk nok, men sådan noget som at se, hvad der rent faktisk er nytilføjede bøger, er svært at følge med i.
eReolen har godt nok en sektion, de kalder “nyheder”, med en søgning der i skrivende stund (februar 2022) hedder noget i retning af:
(dkcclterm.op=202112* OR dkcclterm.op=202201*) AND term.type=ebog and facet.category=voksenmaterialer
Slår man op i beskrivelsen af brøndindekser, kan man se at “dkcclterm.op” dækker over:
dkcclterm.op | op | Oprettelsesdato |
Men hvordan kan det være, at en visning af nyheder søger på oprettelsesdatoer i december og januar? Det er februar nu.
Fordi “Oprettelsesdato” for en titel ikke er det samme som dato for titlens tilføjelse på eReolen. Hvad det betyder, ved jeg ikke med sikkerhed, men i hvert fald ikke titlens tilføjelse på eReolen.
Og det betyder, at der løbende kan dukke spændende bøger op, hvis “dkcclterm.op”-værdi ligger langt tilbage i tiden.
Og det betyder, at jeg kan risikere at misse noget, jeg gerne vil læse.
Hvad gjorde jeg så?
Jeg byggede min egen eReolen! Med en robot, der hver nat monitorerer, hvilke titler der rent faktisk er nye. Hver morgen ligger der en mail til mig om, hvor mange titler robotten har fundet, og hvis jeg har tid og kaffe til det, kan jeg kigge de nye titler igennem over morgenkaffen.
Det fungerer sådan her:
I Django byggede jeg en datamodel over titler med forskellige metadata:
from django.db import models
from isbn_field import ISBNField
class Author(models.Model):
full_name = models.CharField('Forfatter', max_length=200, unique=True)
birth_year = models.DateField(null=True)
def __str__(self):
return self.full_name
class Publisher(models.Model):
publisher = models.CharField('Udgiver', max_length=200, unique=True)
def __str__(self):
return self.publisher
class Keyword(models.Model):
keyword = models.CharField('Nøgleord', max_length=200, unique=True)
def __str__(self):
return self.keyword
class TitleType(models.Model):
title_type = models.CharField('Type', max_length=200, unique=True)
def __str__(self):
return self.title_type
class Language(models.Model):
language = models.CharField('Sprog', max_length=50, unique=True)
def __str__(self):
return self.language
class Isbn(models.Model):
isbn = ISBNField(null=True, blank=True)
def __str__(self):
return self.isbn
class Audience(models.Model):
audience = models.CharField('Målgruppe', max_length=200, unique=True)
def __str__(self):
return self.audience
class TitleFormat(models.Model):
title_format = models.CharField('Format', max_length=50, unique=True)
def __str__(self):
return self.title_format
class Title(models.Model):
added = models.DateField()
object_id = models.CharField('Ereolen-id', max_length=50, unique=True)
title = models.CharField('Titel', max_length=500)
original_title = models.CharField('Originaltitel', max_length=500, default="")
publish_date = models.DateField(null=True)
dk5 = models.CharField('DK5-kode', max_length=10, default="")
cover_url = models.URLField('Cover-url', max_length=500, null=True)
ereolen_url = models.URLField('Ereolen-url', max_length=500)
abstract = models.TextField(blank=True)
dkcclterm_op = models.DateField()
publisher = models.ForeignKey(Publisher, on_delete=models.CASCADE)
language = models.ForeignKey(Language, on_delete=models.CASCADE)
title_type = models.ForeignKey(TitleType, on_delete=models.CASCADE)
title_format = models.ForeignKey(TitleFormat, on_delete=models.CASCADE)
author = models.ManyToManyField(Author)
keyword = models.ManyToManyField(Keyword)
audience = models.ManyToManyField(Audience)
isbn = models.ManyToManyField(Isbn)
def __str__(self):
return self.title
def get_authors(self):
return " & ".join([author.full_name for author in self.author.all()])
get_authors.short_description = "Author(s)"
def get_isbns(self):
return ", ".join([isbn.isbn for isbn in self.isbn.all()])
get_isbns.short_description = "ISBN(s)"
def get_keywords(self):
return ", ".join([keyword.keyword for keyword in self.keyword.all()])
get_keywords.short_description = "Keyword(s)"
def get_audiences(self):
return ", ".join([audience.audience for audience in self.audience.all()])
get_audiences.short_description = "Audience(s)"
I Python skrev jeg en robot, der søger eReolen igennem, tilføjer nye titler til min database og ignorerer titler, der allerede er i databasen. Robotten satte jeg op til at køre hver nat på min server:
# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
""" This program saves ebooks, audiobooks and podcasts from ereolen.dk to a local database
that can be used to detect new titles better than ereolen.dk's own search options """
import requests # make http requests
from bs4 import BeautifulSoup # parse html responses
from datetime import date # create date objects
from dateutil.relativedelta import relativedelta # adding and subtracting months to dates
import re # regex for publish year parsing
import psycopg2 # work with postgresql databases
from psycopg2 import Error # database error handling
# Connect to database
try:
connection = psycopg2.connect(user = "",
password = "",
host = "",
port = "",
database = "")
cursor = connection.cursor()
except (Exception, psycopg2.Error) as error:
print("Error while connecting to PostgreSQL", error)
# Set configuration options and global variables
base_url = 'https://ereolen.dk'
term_types = ['ebog','lydbog','podcast']
added = date.today()
number_of_months_to_search = 200
start_month = added - relativedelta(months=number_of_months_to_search-2)
# Search period list goes from current month plus one month and back to start_month
search_period = []
for i in reversed(range(0,number_of_months_to_search)):
year_month_date = start_month + relativedelta(months=+i)
year_month = [year_month_date.year, year_month_date.month]
search_period.append(year_month)
# Crawl loop
title_counter = 0
for year_month in search_period:
for term_type in term_types:
start_date = date(year_month[0],year_month[1],1)
dkcclterm_op_search = start_date.strftime("%Y%m")
page = 0
pages_left = True
while pages_left == True:
# Search for hits
search_url = base_url + '/search/ting/dkcclterm.op%3D' + dkcclterm_op_search + '*%20AND%20term.type%3D' + term_type + '?page=' + str(page) + '&sort=date_descending'
request = requests.get(search_url)
result = request.text
# If an error message is returned in the search, either no results are left, or ereolen.dk is down for some reason
# In this case, the while loop is broken to try next item type and/or next year-month combination
if 'Vi kan desværre ikke finde noget, der matcher din søgning' in result or 'The website encountered an unexpected error. Please try again later.' in result:
pages_left = False
break
# Parse hits and get all item links
soup = BeautifulSoup(result, "lxml")
links = soup.find_all('a', href=True)
item_links = {link['href'] for link in links if "/ting/collection/" in link['href']}
# Go through item link
for link in item_links:
# Get id and check if link is already in databse
object_id = link[link.rfind('/')+1:].replace('%3A',':')
search_sql = '''SELECT * from ereolen_title WHERE object_id = %s'''
cursor.execute(search_sql, (object_id, ))
item_hit = cursor.fetchone()
# No hits means item is not in database and should be added
if not item_hit:
### ADD SEQUENCE ###
# Set full url for item
ereolen_url = base_url + link
# Request item and parse html
title_request = requests.get(ereolen_url)
title_result = title_request.text
title_soup = BeautifulSoup(title_result, "lxml")
# TITLE FIELDS #
# TITLE
try:
title = title_soup.find('div', attrs={'class':'field-name-ting-title'}).text.replace(" : ",": ")
except:
print("Ingen titel på:", ereolen_url)
break
# ORIGINAL TITLE
try:
original_title = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("Original titel:")).next.next.text
except:
original_title = ''
# PUBLISHED
try:
published = title_soup.find('div', class_={"field-name-ting-author"}).get_text()
published = int(re.search("[(]\d\d\d\d[)]", published).group()[1:5])
publish_date = date(published,1,1)
except:
publish_date = None
# COVER URL
try:
cover_url = title_soup.find('div', class_={"ting-cover"}).img['src']
except:
try:
data = {
'coverData[0][id]': object_id,
'coverData[0][image_style]': 'ding_primary_large'
}
response = requests.post('https://ereolen.dk/ting/covers', data=data)
response_json = response.json()
cover_url = response_json[0]['url']
except:
cover_url = ''
# ABSTRACT
abstract = title_soup.find('div', attrs={'class':'field-name-ting-abstract'}).text
# DKCCLTERM_OP
dkcclterm_op = start_date
# FOREIGN KEY FIELDS #
# LANGUAGE
try:
ereolen_language = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("Sprog:")).next.next.text
except:
ereolen_language = 'Ukendt'
language_sql = '''SELECT * from ereolen_language WHERE language = %s'''
cursor.execute(language_sql, (ereolen_language, ))
try:
language = cursor.fetchone()[0]
except:
language_insert = '''INSERT INTO ereolen_language(language) VALUES(%s) RETURNING id'''
cursor.execute(language_insert, (ereolen_language, ))
language = cursor.fetchone()[0]
# PUBLISHER
try:
ereolen_publisher = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("Forlag:")).next.next.text
except:
ereolen_publisher = 'Ukendt'
publisher_sql = '''SELECT * from ereolen_publisher WHERE publisher = %s'''
cursor.execute(publisher_sql, (ereolen_publisher, ))
try:
publisher = cursor.fetchone()[0]
except:
publisher_insert = '''INSERT INTO ereolen_publisher(publisher) VALUES(%s) RETURNING id'''
cursor.execute(publisher_insert, (ereolen_publisher, ))
publisher = cursor.fetchone()[0]
# TYPE
try:
ereolen_type = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("Type:")).next.next.text
except:
ereolen_type = 'Ukendt'
type_sql = '''SELECT * from ereolen_titletype WHERE title_type = %s'''
cursor.execute(type_sql, (ereolen_type, ))
try:
title_type = cursor.fetchone()[0]
except:
title_type_insert = '''INSERT INTO ereolen_titletype(title_type) VALUES(%s) RETURNING id'''
cursor.execute(title_type_insert, (ereolen_type, ))
title_type = cursor.fetchone()[0]
# FORMAT
try:
ereolen_format = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("Ebogsformat:")).next.next.text
except:
ereolen_format = "Ukendt"
format_sql = '''SELECT * from ereolen_titleformat WHERE title_format = %s'''
cursor.execute(format_sql, (ereolen_format, ))
try:
title_format = cursor.fetchone()[0]
except:
title_format_insert = '''INSERT INTO ereolen_titleformat(title_format) VALUES(%s) RETURNING id'''
cursor.execute(title_format_insert, (ereolen_format, ))
title_format = cursor.fetchone()[0]
# DK5 - TODO: Not done yet
dk5 = ""
### SAVE BEFORE ADDING MANY-TO-MANY FIELDS ###
title_data = (added,title_type,title,original_title,publisher,object_id,language,publish_date,cover_url,ereolen_url,title_format,abstract,dkcclterm_op,dk5)
title_insert = '''INSERT INTO ereolen_title(added,title_type_id,title,original_title,publisher_id,object_id,language_id,publish_date,cover_url,ereolen_url,title_format_id,abstract,dkcclterm_op,dk5) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id'''
cursor.execute(title_insert, title_data)
title_id = cursor.fetchone()[0]
connection.commit()
# MANY-TO-MANY FIELDS #
# AUDIENCE(S)
try:
audience_div = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("Målgruppe:")).next.next
audiences = audience_div.find_all('span')
audiences_list = [aud.text for aud in audiences]
except:
audiences_list = ['Ukendt']
for audience in audiences_list:
audience_sql = '''SELECT * from ereolen_audience WHERE audience = %s'''
cursor.execute(audience_sql, (audience, ))
try:
audience_id = cursor.fetchone()[0]
except:
audience_insert = '''INSERT INTO ereolen_audience(audience) VALUES(%s) RETURNING id'''
cursor.execute(audience_insert, (audience, ))
audience_id = cursor.fetchone()[0]
audience_relation_sql = '''INSERT INTO ereolen_title_audience (title_id, audience_id) VALUES (%s,%s)'''
try:
cursor.execute(audience_relation_sql, (title_id,audience_id))
except:
connection.rollback()
# ISBN(S)
try:
isbn_div = title_soup.find('div', attrs={'class':'field-label'}, string=re.compile("ISBN:")).next.next
isbns = isbn_div.find_all('span')
isbns_list = [isb.text for isb in isbns]
for isbn in isbns_list:
isbn_sql = '''SELECT * from ereolen_isbn WHERE isbn = %s'''
cursor.execute(isbn_sql, (isbn, ))
try:
isbn_id = cursor.fetchone()[0]
except:
isbn_insert = '''INSERT INTO ereolen_isbn(isbn) VALUES(%s) RETURNING id'''
cursor.execute(isbn_insert, (isbn, ))
isbn_id = cursor.fetchone()[0]
isbn_relation_sql = '''INSERT INTO ereolen_title_isbn (title_id, isbn_id) VALUES (%s,%s)'''
try:
cursor.execute(isbn_relation_sql, (title_id,isbn_id))
except:
connection.rollback()
except:
pass
# KEYWORDS(S)
keywords_div = title_soup.find('div', attrs={'class':'field-name-ting-subjects'})
if keywords_div:
keywords = [link.text for link in keywords_div.find_all('a')]
for keyword in keywords:
keyword_sql = '''SELECT * from ereolen_keyword WHERE keyword = %s'''
cursor.execute(keyword_sql, (keyword, ))
try:
keyword_id = cursor.fetchone()[0]
except:
keyword_insert = '''INSERT INTO ereolen_keyword(keyword) VALUES(%s) RETURNING id'''
cursor.execute(keyword_insert, (keyword, ))
keyword_id = cursor.fetchone()[0]
keyword_relation_sql = '''INSERT INTO ereolen_title_keyword (title_id, keyword_id) VALUES (%s,%s)'''
try:
cursor.execute(keyword_relation_sql, (title_id,keyword_id))
except:
connection.rollback()
# AUTHOR(S)
creator_full = title_soup.find('div', attrs={'class':'field-name-ting-author'}).text.replace("Af ","")
# Remove date of book
creator = creator_full[:creator_full.rfind("(")-1]
authors = creator.split(",")
for author in authors:
birth_year = None
if ' (f. ' in author and not len(author) < 7:
if 'ca. ' in author:
author = author.replace('ca. ','')
birth_year_string = author[author.index("(f.")+4:author.index("(f.")+8]
if ')' in birth_year_string:
birth_year_string = birth_year_string.replace(')','')
birth_year = date(int(birth_year_string),1,1)
author = author[:author.index(" (f.")]
elif ' (f. ' in author:
breakpoint()
# Some times there are no authors, but still a published year
if len(author) == 5 and "(" in author:
author = ""
if author:
author = author.strip()
author_sql = '''SELECT * from ereolen_author WHERE full_name = %s'''
cursor.execute(author_sql, (author, ))
try:
author_id = cursor.fetchone()[0]
except:
if birth_year:
author_insert = '''INSERT INTO ereolen_author(full_name,birth_year) VALUES(%s,%s) RETURNING id'''
cursor.execute(author_insert, (author,birth_year))
else:
author_insert = '''INSERT INTO ereolen_author(full_name) VALUES(%s) RETURNING id'''
cursor.execute(author_insert, (author, ))
author_id = cursor.fetchone()[0]
author_relation_sql = '''INSERT INTO ereolen_title_author (title_id, author_id) VALUES (%s,%s)'''
try:
cursor.execute(author_relation_sql, (title_id,author_id))
except:
connection.rollback()
### SAVE ###
connection.commit()
title_counter += 1
page += 1
connection.close()
print('Ereolen crawl ran')
if title_counter > 0:
print('Added titles on ereolen:', title_counter)
Og i Djangos indbyggede administrationsinterface, kan jeg med fint overblik og gode søgnings-, sorterings- og filtreringsmuligheder få øje på en novellesamling af Georg Metz, der netop er dukket op i eReolen med en “dkcclterm.op”-værdi fra september 2013!

Må jeg prøve?
Jeg ville gerne dele mit værktøj med andre, men det er ikke helt lige til at afklare, hvilke dele af eReolens bogdata, der er frie og offentlige, og hvilke der ejes af en (i mine øjne) lidt underlig konstruktion, der hedder DBC. Et KL-ejet firma (Kommunernes Landsforening), der tjener penge på at sælge data om bøger til – kommuner (og nogle andre aktører, som jeg gætter på næsten udelukkende er offentlige).
Jeg er ved at undersøge, hvad jeg kan offentliggøre uden at genere nogen eller bryde ophavsretsloven. Det kan godt være, det tager lidt tid.
Borgerforslag i version 1
Jeg har endelig haft til til at vise mine nyindsamlede data fra borgerforslag.dk frem for verden.
Jeg har tidligere skrevet om, hvordan jeg har sat en robot til at indhente støttedata om borgerforslag, og hvordan jeg optimerede indsamlingen af data.
Nu har jeg bygget første udgave af en Django-app, der fremviser mine skønne data.
Du kan prøve den af på wallnot.dk/borgerforslag.
Koden bag består af to views i Django. En forside (index), der viser en tabel med en liste over borgerforslag, og en side (forslag), der viser data om det enkelte borgerforslag:
def index(request):
context = {}
today = date.today()
context['today'] = today
all_suggestions = Suggestion.objects.annotate(votes=Max('vote__votes'))
# Sort logic
sort = request.GET.get('sort')
if sort == "-date" or not sort:
all_suggestions = all_suggestions.order_by('-suggested_date')
elif sort == "date":
all_suggestions = all_suggestions.order_by('suggested_date')
elif sort == "title":
all_suggestions = all_suggestions.order_by('title')
elif sort == "-title":
all_suggestions = all_suggestions.order_by('-title')
elif sort == "votes":
all_suggestions = all_suggestions.order_by('votes')
elif sort == "-votes":
all_suggestions = all_suggestions.order_by('-votes')
context['all_suggestions'] = all_suggestions
return render(request, 'borgerforslag/index.html', context)
def forslag(request, id):
context = {}
today = date.today()
start_of_data_collection_date = date(2021, 10, 19)
context['start_of_data_collection_date'] = start_of_data_collection_date
# Votes per day is used to display the aggregated number of votes per day
votes_per_day = Suggestion.objects.filter(pk=id).annotate(date=TruncDate('vote__timestamp')).order_by('id', 'date').annotate(number_of_votes=Max('vote__votes'))
context['votes_per_day'] = votes_per_day
suggestion = Suggestion.objects.get(pk=id)
context['suggestion'] = suggestion
votes = suggestion.vote_set
# The number of votes per day since suggestion was made
votes_max = votes.aggregate(Max('votes'))['votes__max']
context['votes_max'] = votes_max
votes_per_day_average = int(votes_max / (today-suggestion.suggested_date).days)
context['votes_per_day_average'] = votes_per_day_average
# The number of votes per day for the last 7 days
votes_7_days = votes.filter(timestamp__gt=today-timedelta(days=7))
votes_max_min_7_days = votes_7_days.aggregate(Max('votes'), Min('votes'))
try:
votes_per_7_days_average = int((votes_max_min_7_days['votes__max'] - votes_max_min_7_days['votes__min']) / 7)
except:
votes_per_7_days_average = 0
context['votes_per_7_days_average'] = votes_per_7_days_average
days_left_of_suggestion = (suggestion.end_date-today).days
likely_to_succeed_votes_per_day = votes_max + days_left_of_suggestion * votes_per_day_average
context['likely_to_succeed_votes_per_day'] = likely_to_succeed_votes_per_day
likely_to_succeed_votes_per_7_days = votes_max + days_left_of_suggestion * votes_per_7_days_average
context['likely_to_succeed_votes_per_7_days'] = likely_to_succeed_votes_per_7_days
return render(request, 'borgerforslag/forslag.html', context)
Forsiden består af følgende skabelon:
{% extends "borgerforslag/base.html" %}
{% load static %}
{% block title %}Borgerforslag{% endblock %}
{% block content %}{% spaceless %}
<h1 class="display-4 mb-4">Borgerforslag</h1>
<table class="table table-striped"">
<caption>Liste over borgerforslag fra borgerforslag.dk</caption>
<tr>
<th><a title="Sorter efter stillet dato" href="{% url 'borgerforslag_index' %}?sort={% if request.GET.sort == "date" %}-date{% else %}date{% endif %}">Stillet dato</a></th>
<th><a title="Sorter efter forslagets titel" href="{% url 'borgerforslag_index' %}?sort={% if request.GET.sort == "title" %}-title{% else %}title{% endif %}">Titel</a></th>
<th><a title="Sorter efter antal støtter" href="{% url 'borgerforslag_index' %}?sort={% if request.GET.sort == "votes" %}-votes{% else %}votes{% endif %}">Støtter</a></th>
</tr>
{% for suggestion in all_suggestions %}
<tr>
<td class="text-nowrap">{{ suggestion.suggested_date }}</td>
<td><a href="{% url 'borgerforslag_forslag' suggestion.id %}">{{ suggestion.title }}{% if suggestion.votes < 50000 %} ({% if suggestion.end_date < today %}udløb{% else %}udløber{% endif %} {{ suggestion.end_date }}){% endif %}</td>
<td>{{ suggestion.votes }}</td>
</tr>
{% endfor %}
</table>
{% endspaceless %}{% endblock %}
Og forslagssiden genereres af denne skabelon:
{% extends "borgerforslag/base.html" %}
{% load static %}
{% block title %}Borgerforslag: {{ suggestion.title }}{% endblock %}
{% block content %}{% spaceless %}
<h1 class="display-4 mb-4">{{ suggestion.title }}</h1>
<p><a href="https://borgerforslag.dk{{ suggestion.url }}">Læs om forslaget på borgerforslag.dk</a></p>
<p><strong>Startdato:</strong> {{ suggestion.suggested_date }}</p>
<p><strong>Slutdato:</strong> {{ suggestion.end_date }}</p>
<p><strong>Støtter i alt:</strong> {{ votes_max }}{% if votes_max >= 50000 %} - forslaget vil blive fremsat som beslutningsforslag i Folketinget!{% endif %}</p>
<p>I gennemsnit {{ votes_per_day_average }} støtter per dag siden forslaget blev stillet. Hvis trenden fortsætter, opnår forslaget ca. {{ likely_to_succeed_votes_per_day }} støtter inden slutdatoen{% if likely_to_succeed_votes_per_day >= 50000 %} og vil blive fremsat som beslutningsforslag i Folketinget.{% else %}. Det er ikke nok til at blive fremsat som beslutningsforslag i Folketinget.{% endif %}</p>
<p>I gennemsnit {{ votes_per_7_days_average }} støtter per dag de sidste 7 dage. Hvis trenden fortsætter, opnår forslaget ca. {{ likely_to_succeed_votes_per_7_days }} støtter inden slutdatoen{% if likely_to_succeed_votes_per_7_days >= 50000 %} og vil blive fremsat som beslutningsforslag i Folketinget.{% else %}. Det er ikke nok til at blive fremsat som beslutningsforslag i Folketinget.{% endif %}</p>
<h2 class="mb-4">Udvikling i støtter for forslaget</h2>
{% if suggestion.suggested_date < start_of_data_collection_date %}<p>Obs! Forslaget blev fremsat før dataindsamlingen til denne side fra borgerforslag.dk startede den 19. oktober 2021. Der vises derfor ikke en komplet graf over udviklingen i støtter.</p>{% endif %}
<div id="chart"></div>
<script>
var options = {
title: {
text: 'Antal støtter over tid for forslaget {{ suggestion.title }}',
align: 'left'
},
chart: {
type: 'line',
locales: [{
"name": "da",
"options": {
"months": ["Januar", "Februar", "Marts", "April", "Maj", "Juni", "Juli", "August", "September", "Oktober", "November", "December"],
"shortMonths": ["Jan", "Feb", "Mar", "Apr", "Maj", "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dec"],
"days": ["Søndag", "Mandag", "Tirsdag", "Onsdag", "Torsdag", "Fredag", "Lørdag"],
"shortDays": ["Søn", "Man", "Tir", "Ons", "Tor", "Fre", "Lør"],
"toolbar": {
"exportToSVG": "Download SVG",
"exportToPNG": "Download PNG",
"exportToCSV": "Download CSV",
"menu": "Menu",
"selection": "Valg",
"selectionZoom": "Zoom til valg",
"zoomIn": "Zoom ind",
"zoomOut": "Zoom ud",
"pan": "Panorer",
"reset": "Nulstil zoom"
}
}
}],
defaultLocale: "da",
},
series: [{
name: 'Støtter',
data: [{% for suggestion in votes_per_day %}{x: new Date('{{ suggestion.date|date:"Y-m-d" }}').getTime(), y: {{ suggestion.number_of_votes }}},{% endfor %}]
}],
xaxis: {
type: 'datetime',
title: {
text: 'Dato'
}
},
yaxis: {
title: {
text: 'Støtter'
}
},
}
var chart = new ApexCharts(document.querySelector("#chart"), options);
chart.render();
</script>
{% endspaceless %}{% endblock %}
Jeg bruger ApexCharts.js til at vise den fine graf over udviklingen i støtter for hvert enkelt forslag.
Hvordan udvikler antal underskrifter sig på Borgerforslag.dk?
På Twitter skrev Peter Brodersen:

Peters idé er sjov, synes jeg, så jeg er så småt begyndt at bygge et eller andet, der monitorerer hvordan antallet af underskrifter på borgerforslag udvikler sig over tid.
Så nu tygger min webserver sig igennem nedenstående script hvert 10. minut og gemmer det aktuelle antal underskrifter på hvert borgerforslag. Når der er gået nogle uger, vil jeg se om jeg kan lave nogle interessante visualiseringer af data.
import requests
from datetime import datetime
import locale
import psycopg2
from psycopg2 import Error
### PREPARATION ###
# Locale is set to Danish to be able to parse dates from Borgerforslag
locale.setlocale(locale.LC_TIME, ('da_DK', 'UTF-8'))
# API url and request parameters
url = 'https://www.borgerforslag.dk/api/proposals/search'
suggestions_per_request = 300
params_json = {
"filter": "active",
"sortOrder": "NewestFirst",
"searchQuery":"",
"pageNumber":0,
"pageSize": suggestions_per_request
}
# Connect to database
try:
connection = psycopg2.connect(user = "",
password = "",
host = "",
port = "",
database = "")
cursor = connection.cursor()
except (Exception, psycopg2.Error) as error:
print ("Error while connecting to PostgreSQL", error)
now = datetime.utcnow()
# Insert into database function
def insert_suggestion_and_votes(connection, suggestion):
with connection:
with connection.cursor() as cur:
try:
# See if suggestion already exists
sql = '''SELECT * FROM borgerforslag_suggestion WHERE unique_id = %s'''
cur.execute(sql, (suggestion['externalId'],))
suggestion_records = cur.fetchone()
# If not, add suggestion
if not suggestion_records:
suggestion_data = (suggestion['externalId'],suggestion['title'],suggestion['date'],suggestion['url'],suggestion['status'])
sql = '''INSERT INTO borgerforslag_suggestion(unique_id,title,suggested_date,url,status) VALUES(%s,%s,%s,%s,%s) RETURNING id'''
cur.execute(sql, suggestion_data)
id = cur.fetchone()[0]
# If yes, get id
else:
id = suggestion_records[0]
# Add votes
sql = '''INSERT INTO borgerforslag_vote(suggestion_id,timestamp,votes)
VALUES(%s,%s,%s)'''
cur.execute(sql, (id,now,suggestion['votes']))
except Error as e:
print(e, suggestion)
# Loop preparation
requested_results = 0
number_of_results = requested_results + 1
number_of_loops = 0
# Loop to get suggestions and add them to database
while requested_results < number_of_results and number_of_loops < 10:
response = requests.post(url, json=params_json)
json_response = response.json()
number_of_results = json_response['resultCount']
requested_results += suggestions_per_request
number_of_loops += 1
params_json['pageNumber'] += 1
for suggestion in json_response['data']:
suggestion['date'] = datetime.strptime(suggestion['date'], '%d. %B %Y') # convert date to datetime
insert_suggestion_and_votes(connection, suggestion)
RSS-feeds med Django
En Wallnot-bruger spurgte mig om ikke Wallnot burde have et RSS-feed? Jo da.
Det viste sig at den slags er indbygget i Django og meget nemt at lave.
Jeg oprettede feeds.py med to forskellige feeds. Et for alle artikler fra Wallnot, et for artikler for bestemte medier:
from django.contrib.syndication.views import Feed
from wall.models import Article
from django.urls import reverse
class RssFeed(Feed):
title = "Nyeste artikler fra wallnot.dk"
link = "/rss/"
description = "De allernyeste artikler uden paywall fra wallnot.dk"
def items(self):
return Article.objects.filter(paywall_detected=False).order_by('-date')[:20]
def item_title(self, item):
return item.title
def item_description(self, item):
return "Artikel fra " + item.get_medium_display()
def item_link(self, item):
return item.url
class RssFeedMedium(Feed):
title = "Nyeste artikler fra wallnot.dk"
description = "De allernyeste artikler uden paywall fra wallnot.dk"
def get_object(self, request, medium):
return medium
def link(self, obj):
return "/rss/" + obj
def items(self, obj):
return Article.objects.filter(paywall_detected=False, medium=obj).order_by('-date')[:20]
def item_title(self, item):
return item.title
def item_description(self, item):
return "Artikel fra " + item.get_medium_display()
def item_link(self, item):
return item.url
Og pegede på dem fra urls.py:
from django.urls import path
from . import views
from . import feeds
urlpatterns = [
path('', views.index, name='index'),
path('links', views.linkindex, name='linkindex'),
path('privatliv', views.privacy, name='privacy'),
path('om', views.about, name='wabout'),
path('rss', feeds.RssFeed(), name='rssfeed'),
path('rss/<str:medium>', feeds.RssFeedMedium(), name='rssfeed_medium'),
]
Og vupti!
Du finder RSS-feeds på wallnot.dk lige ved siden af Twitter-logoet.
ETF’er og fonde med aktiebeskatning 2021
For et par år siden blev det muligt at købe og tjene/tabe penge på aktiebaserede ETF’er og udenlandske investeringsfonde som aktieindkomst og ikke længere som kapitalindkomst.
Det eneste problem er/var, er at det velmenende regneark, der viser aktiebaserede investeringsselskaber, som er godkendt til den lavere beskatning på skat.dk, er en lille smule svært at bruge, når man gerne vil sammenligne værdipapirerne og finde ud af, hvor de kan købes.
Derfor har jeg lavet https://wallnot.dk/stocks.
Her kan du læse om, hvordan jeg gjorde.
- Jeg downloaded excelarket fra skat.dk
- Jeg tilføjede nogle kolonner og gemte som CSV-fil
- Jeg brugte Python til at hente data og links til værdipapirer hos Saxo Bank, Nordnet og Morningstar
- Jeg oprettede en app i Django og definerede en datamodel tilsvarende excelarket
- Jeg importerede data til Django
- Jeg byggede visningen
Nogle timers arbejde for mig. Forhåbentlig nogle sparede timer for dig.
Download af excelark
https://skat.dk/getfile.aspx?id=145013&type=xlsx
Tilføje nogle kolonner og gemme som CSV-fil
Lidt upædagogisk, men hvad:
Registreringsland/Skattemæssigt hjemsted;ISIN-kode;Navn;LEI kode;ASIDENT;CVR/SE/TIN;Venligt navn;Første registreringsår;Morningstar_id;Saxo_id;Nordnet_url;Nordnet_id;Nordnet_ÅOP;Nordnet_udbyttepolitik;Nordnet_prospekt;Saxo_url;Morningstar_prospekt;Morningstar_url;Morningstar_ÅOP
Hente data og links til værdipapirer
Ret sjusket Python-program. Men fungerer OK:
import csv
import requests
import re
import json
from bs4 import BeautifulSoup
def nordnet_cookies():
# Nordnet user account credentials
user = ''
password = ''
# A cookie dictionary for storing cookies
cookies = {}
# First part of cookie setting prior to login
url = 'https://classic.nordnet.dk/mux/login/start.html?cmpi=start-loggain&state=signin'
request = requests.get(url)
cookies['LOL'] = request.cookies['LOL']
cookies['TUX-COOKIE'] = request.cookies['TUX-COOKIE']
# Second part of cookie setting prior to login
url = 'https://classic.nordnet.dk/api/2/login/anonymous'
request = requests.post(url, cookies=cookies)
cookies['NOW'] = request.cookies['NOW']
# Actual login that gets us cookies required for later use
url = "https://classic.nordnet.dk/api/2/authentication/basic/login"
request = requests.post(url,cookies=cookies, data = {'username': user, 'password': password})
cookies['NOW'] = request.cookies['NOW']
cookies['xsrf'] = request.cookies['xsrf']
# Getting a NEXT cookie
url = "https://classic.nordnet.dk/oauth2/authorize?client_id=NEXT&response_type=code&redirect_uri=https://www.nordnet.dk/oauth2/"
request = requests.get(url, cookies=cookies)
cookies['NEXT'] = request.history[1].cookies['NEXT']
return cookies
def saxo_headers():
# Saxo user account credentials
user = ''
password = ''
# Visit login page and get AuthnRequest token value from input form
url = 'https://www.saxoinvestor.dk/Login/da/'
request = requests.get(url)
soup = BeautifulSoup(request.text, "html.parser")
input = soup.find_all('input', {"id":"AuthnRequest"})
authnrequest = input[0]["value"]
# Login step 1: Submit username, password and token and get another token back
url = 'https://www.saxoinvestor.dk/Login/da/'
request = requests.post(url, data = {'field_userid': user, 'field_password': password, 'AuthnRequest': authnrequest})
soup = BeautifulSoup(request.text, "html.parser")
input = soup.find_all('input', {"name":"SAMLResponse"})
# Most of the time this works
if input:
samlresponse = input[0]["value"]
# But sometimes there's a disclaimer that Saxo Bank would like you to accept
else:
input = soup.find_all('input')
inputs = {}
try:
for i in input:
inputs[i['name']] = i['value']
except:
pass
url = 'https://www.saxotrader.com/disclaimer'
request = requests.post(url, data=inputs)
cook = request.cookies['DisclaimerApp']
returnurl = cook[cook.find("ReturnUrl")+10:cook.find("&IsClientStation")]
url = 'https://live.logonvalidation.net/complete-app-consent/' + returnurl[returnurl.find("complete-app-consent/")+21:]
request = requests.get(url)
soup = BeautifulSoup(request.text, "html.parser")
input = soup.find_all('input', {"name":"SAMLResponse"})
samlresponse = input[0]["value"]
# Login step 2: Get bearer token necessary for API requests
url = 'https://www.saxoinvestor.dk/investor/login.sso.ashx'
r = requests.post(url, data = {'SAMLResponse': samlresponse})
bearer = r.history[0].headers['Location']
bearer = bearer[bearer.find("BEARER"):bearer.find("/exp/")]
bearer = bearer.replace("%20"," ")
# START API CALLS
# Documentation at https://www.developer.saxo/openapi/learn
# Set bearer token as header
headers = {'Authorization': bearer}
return headers
nordnet_cookies = nordnet_cookies()
saxo_headers = saxo_headers()
filename = 'Copy of ABIS liste 2021 - opdateret den 11-01-2021.csv'
output_file = 'stocks.csv'
get_nordnet = True
get_saxo = True
get_morningstar = True
with open(output_file, 'w', newline='') as output_csv:
paperwriter = csv.writer(output_csv, delimiter=';', quotechar ='"', quoting = csv.QUOTE_MINIMAL)
with open(filename) as csvfile:
paperreader = csv.reader(csvfile, delimiter=';')
for row in paperreader:
if row[1] != '0' and row[1] != 'ISIN-kode' and row[1] != '':
isin = row[1]
if get_morningstar == True:
morningstar = requests.get('https://www.morningstar.dk/dk/util/SecuritySearch.ashx?q=' + isin)
morningstar_text = morningstar.text
if morningstar_text:
first_hit = morningstar_text[morningstar_text.index("{"):morningstar_text.index("}")+1]
first_hit_json = json.loads(first_hit)
morningstar_id = first_hit_json['i']
morningstar_url = 'https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=' + morningstar_id
morningstar_info = requests.get(morningstar_url)
soup = BeautifulSoup(morningstar_info.text, "lxml")
try:
aop = soup.find(text=re.compile('Løbende omkostning'))
aop_value = aop.parent.next.next.next.next.next.next.next.string
if aop_value:
cleaned_aop = aop_value.replace(",",".").replace("%","")
else:
cleaned_aop = ''
except:
cleaned_aop = ''
morningstar_documents = requests.get('https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=' + morningstar_id + '&tab=12')
document_soup = BeautifulSoup(morningstar_documents.text, "lxml")
try:
prospect = document_soup.find(text=re.compile('CI'))
prospect_link = prospect.parent.next.next.next.next.next.next.next.next.a['href']
document_id = prospect_link[prospect_link.index("Id=")+3:prospect_link.rfind("&")]
document_url = 'https://doc.morningstar.com/document/' + document_id + '.msdoc'
except:
try:
prospect = document_soup.find(text=re.compile('Prospekt'))
prospect_link = prospect.parent.next.next.next.next.next.next.next.next.a['href']
document_id = prospect_link[prospect_link.index("Id=")+3:prospect_link.rfind("&")]
document_url = 'https://doc.morningstar.com/document/' + document_id + '.msdoc'
except:
document_url = ''
row[8] = morningstar_id
row[16] = document_url
row[17] = morningstar_url
row[18] = cleaned_aop
if get_saxo == True:
saxo = requests.get('https://www.saxotrader.com/openapi/ref/v1/instruments/?$top=201&$skip=0&includeNonTradable=true&AssetTypes=Stock,Bond,MutualFund,Etf,Etc,Etn,Fund,Rights,CompanyWarrant,StockIndex&keywords=' + isin + '&OrderBy=', headers=saxo_headers)
try:
saxo_json = saxo.json()
if saxo_json and saxo.status_code == 200:
try:
data = saxo_json['Data']
if data:
identifier = data[0]['Identifier']
assettype = data[0]['AssetType']
saxo_url = 'https://www.saxotrader.com/d/trading/product-overview?assetType=' + assettype + '&uic=' + str(identifier)
row[9] = identifier
row[15] = saxo_url
except Exception as e:
print(e)
breakpoint()
except:
pass
if get_nordnet == True:
nordnet = requests.get('https://www.nordnet.dk/api/2/main_search?query=' + isin + '&search_space=ALL&limit=60', cookies=nordnet_cookies)
nordnet_json = nordnet.json()
if nordnet_json and nordnet.status_code == 200:
try:
display_types = [hit['display_group_type'] for hit in nordnet_json]
except:
breakpoint()
good_hit = "wait"
try:
good_hit = display_types.index('ETF')
base_url = 'https://www.nordnet.dk/markedet/etf-lister/'
except:
try:
good_hit = display_types.index('PINV')
base_url = 'https://www.nordnet.dk/markedet/investeringsforeninger-liste/'
except:
try:
good_hit = display_types.index('FUND')
base_url = 'https://www.nordnet.dk/markedet/fondslister/'
except:
try:
bad_hit = display_types.index('NEWS')
except:
try:
good_hit = display_types.index('EQUITY')
base_url = 'https://www.nordnet.dk/markedet/aktiekurser/'
except:
breakpoint()
if good_hit != 'wait':
results = nordnet_json[good_hit]['results']
instrument_id = results[0]['instrument_id']
display_name = results[0]['display_name']
space_counter = 0
paper_url = ''
for letter in display_name:
if letter == " ":
space_counter += 1
if space_counter > 2:
break
letter = '-'
paper_url += letter
else:
letter = letter.lower()
paper_url += letter
full_url = base_url + str(instrument_id) + '-' + paper_url
if "&" in full_url:
full_url = full_url.replace("&","")
check_full_url = requests.get(full_url)
soup = BeautifulSoup(check_full_url.text, "lxml")
try:
policy = soup.find('span', text=re.compile('Udbyttepolitik'))
policy_value = policy.next.next.string
except:
policy_value = "Ukendt"
try:
prospectus = soup.find('span', text=re.compile('Faktaark'))
prospectus_value = prospectus.next.next.a['href']
cleaned_prospectus = prospectus_value[:prospectus_value.rfind("?")].replace('http','https')
except:
cleaned_prospectus = "Ukendt"
try:
aop = soup.find('span', text=re.compile('Årlig omkostning'))
aop_value = aop.next.next.get_text()
cleaned_aop = aop_value.replace(",",".").replace("%","")
except:
cleaned_aop = "Ukendt"
row[10] = check_full_url.url
row[11] = instrument_id
row[12] = cleaned_aop
row[13] = policy_value
row[14] = cleaned_prospectus
print(row)
paperwriter.writerow(row)
Datamodel i Django
Her er models.py:
from django.db import models
class Stock(models.Model):
country = models.CharField('Registreringsland', max_length=2)
isin = models.CharField('ISIN-kode', max_length=20, blank=True)
name = models.CharField('Navn', max_length=200, blank=True)
lei = models.CharField('LEI-kode', max_length=20, blank=True)
asident = models.CharField('ASIDENT', max_length=20, blank=True)
cvr = models.CharField('CVR/SE/TIN', max_length=20, blank=True)
friendly_name = models.CharField('Venligt navn', max_length=200, blank=True)
first_registration_year = models.CharField('Første registreringsår', max_length=4, blank=True)
morningstar_id = models.CharField('Morningstar: Id', max_length=20, blank=True)
saxo_id = models.CharField('Saxo Bank: Id', max_length=20, blank=True)
nordnet_id = models.CharField('Nordnet: Id', max_length=20, blank=True)
morningstar_url = models.URLField('Morningstar: Url', max_length=200, blank=True)
saxo_url = models.URLField('Saxo Bank: Url', max_length=200, blank=True)
nordnet_url = models.URLField('Nordnet: Url', max_length=200, blank=True)
morningstar_aop = models.FloatField('Morningstar: Løbende omkostninger', null=True, blank=True)
nordnet_aop = models.FloatField('Nordnet: Løbende omkostninger', null=True, blank=True)
nordnet_dividend = models.CharField('Nordnet: Udbyttepolitik', max_length=20, blank=True)
nordnet_prospect = models.URLField('Nordnet: Investorinformation', max_length=200, blank=True)
morningstar_prospect = models.URLField('Morningstar: Investorinformation', max_length=200, blank=True)
Importere data til Django
Her brugte jeg Django’s databasehåndtering i stedet for selv at skrive SQL-sætninger:
import csv
with open('stocks.csv', newline='\n') as csvfile:
reader = csv.DictReader(csvfile, delimiter=";")
count = 0
for row in reader:
stock = Stock(country = row['Registreringsland/Skattemæssigt hjemsted'])
if row['ISIN-kode']:
stock.isin = row['ISIN-kode']
if row['Navn']:
stock.name = row['Navn']
if row['LEI kode']:
stock.lei = row['LEI kode']
if row['ASIDENT']:
stock.asident = row['ASIDENT']
if row['CVR/SE/TIN']:
stock.cvr = row['CVR/SE/TIN']
if row['Venligt navn']:
stock.friendly_name = row['Venligt navn']
if row['Første registreringsår']:
stock.first_registration_year = row['Første registreringsår']
if row['Morningstar_id']:
stock.morningstar_id = row['Morningstar_id']
if row['Saxo_id']:
stock.saxo_id = row['Saxo_id']
if row['Nordnet_id']:
stock.nordnet_id = row['Nordnet_id']
if row['Morningstar_url']:
stock.morningstar_url = row['Morningstar_url']
if row['Saxo_url']:
stock.saxo_url = row['Saxo_url']
if row['Nordnet_url']:
stock.nordnet_url = row['Nordnet_url']
if row['Morningstar_ÅOP']:
stock.morningstar_aop = row['Morningstar_ÅOP']
if row['Nordnet_ÅOP'] and row['Nordnet_ÅOP'] != '-' and row['Nordnet_ÅOP'] != 'Ukendt':
stock.nordnet_aop = row['Nordnet_ÅOP']
if row['Nordnet_udbyttepolitik']:
stock.nordnet_dividend = row['Nordnet_udbyttepolitik']
if row['Nordnet_prospekt']:
stock.nordnet_prospect = row['Nordnet_prospekt']
if row['Morningstar_prospekt']:
stock.morningstar_prospect = row['Morningstar_prospekt']
stock.save()
count += 1
print(count)
Bygge visningen
Her er views.py:
from django.shortcuts import render
from .models import Stock
def index(request):
#FILTER LOGIC
if request.GET.get('filter'):
filter = request.GET.get('filter')
if filter == 'nordnetsaxo':
stocks = Stock.objects.exclude(nordnet_url='') | Stock.objects.exclude(saxo_url='')
elif filter == 'nordnet':
stocks = Stock.objects.exclude(nordnet_url='')
elif filter == 'saxo':
stocks = Stock.objects.exclude(saxo_url='')
elif filter == 'ikkenordnetsaxo':
stocks = Stock.objects.filter(nordnet_url='').filter(saxo_url='')
elif filter == 'alle':
stocks = Stock.objects.all()
else:
stocks = Stock.objects.exclude(nordnet_url='') | Stock.objects.exclude(saxo_url='')
#SORT LOGIC
sort = request.GET.get('sort')
print(sort)
if sort == "name" or not sort:
stocks = stocks.order_by('name')
elif sort == "-name":
stocks = stocks.order_by('-name')
elif sort == "isin":
stocks = stocks.order_by('isin')
elif sort == "-isin":
stocks = stocks.order_by('-isin')
elif sort == "morningstar_aop":
stocks = stocks.order_by('morningstar_aop')
elif sort == "-morningstar_aop":
stocks = stocks.order_by('-morningstar_aop')
elif sort == "nordnet_aop":
stocks = stocks.order_by('nordnet_aop')
elif sort == "-nordnet_aop":
stocks = stocks.order_by('-nordnet_aop')
context = {'stocks': stocks}
return render(request, 'stocks/index.html', context)
Og her er så skabelonen index.html:
{% extends "stocks/base.html" %}
{% load static %}
{% block title %}ETF'er og fonde med aktiebeskatning 2021{% endblock %}
{% block content %}{% spaceless %}
<h1>ETF'er og fonde med aktiebeskatning 2021</h1>
<p>Du har læst om, <a href="https://www.nordnet.dk/blog/nye-regler-for-beskatning-af-investeringsfonde/">at aktiebaserede ETF'er og udenlandske investeringsfonde fra 2020 beskattes som aktieindkomst og ikke længere som kapitalindkomst</a>.</p>
<p>Du har endda fundet <a href="https://skat.dk/getfile.aspx?id=145013&type=xlsx">det fine regneark, der viser aktiebaserede investeringsselskaber</a> på <a href="https://skat.dk/skat.aspx?oid=2244641">skat.dk</a>.</p>
<p>Men det er godt nok svært for dig at få overblik over, hvilke af papirerne du overhovedet kan købe som almindelig hobby-/cryptoinvestor, og at sammenligne omkostninger, ÅOP og hvad det ellers hedder, for at finde det rigtige køb.</p>
<p>Her er et forsøg på at løse dit (og mit) problem. Data kommer fra <a href="https://skat.dk/getfile.aspx?id=145013&type=xlsx">det fine regneark</a> og har samme fejl og mangler, men er suppleret med nyttige informationer og links.</p>
<p><a href="#forbehold">Du kan læse om forbehold nederst på siden</a> og du kan <a href="https://helmstedt.dk/2021/03/etfer-og-fonde-med-aktiebeskatning-2021/">læse om hvordan siden er lavet på min blog</a>.</p>
<p><strong>Vis til salg hos:</strong>
<form id="prefs">
<input type="radio" id="nordnetsaxo" name="filter" value="nordnetsaxo"{% if request.GET.filter == "nordnetsaxo" or not request.GET.filter %} checked{% endif %}>
<label title="Værdipapirer til salg hos Nordnet, Saxo Bank eller begge steder" for="nordnetsaxo">Nordnet og/eller Saxo Bank</label>
<input type="radio" id="nordnet" name="filter" value="nordnet"{% if request.GET.filter == "nordnet" %} checked{% endif %}>
<label title="Værdipapirer til salg hos Nordnet" for="nordnet">Nordnet</label>
<input type="radio" id="saxo" name="filter" value="saxo"{% if request.GET.filter == "saxo" %} checked{% endif %}>
<label title="Værdipapirer til salg hos Saxo Bank" for="saxo">Saxo Bank</label>
<input type="radio" id="ikkenordnetsaxo" name="filter" value="ikkenordnetsaxo"{% if request.GET.filter == "ikkenordnetsaxo" %} checked{% endif %}>
<label title="Værdipapirer, der hverken er til salg hos Nordnet eller Saxo Bank" for="ikkenordnetsaxo">Ikke Nordnet og/eller Saxo</label>
<input type="radio" id="alle" name="filter" value="alle"{% if request.GET.filter == "alle" %} checked{% endif %}>
<label title="Alle værdipapirer, både dem der kan købes hos Nordnet/Saxo Bank og de, der ikke kan" for="alle">Hele pivtøjet</label>
</form>
</p>
<table>
<tr>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "-name" %}name{% else %}-name{% endif %}">Navn</a></th>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "isin" %}-isin{% else %}isin{% endif %}">Isin</a></th>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "morningstar_aop" %}-morningstar_aop{% else %}morningstar_aop{% endif %}">Løbende omkostninger</a></th>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "nordnet_aop" %}-nordnet_aop{% else %}nordnet_aop{% endif %}">ÅOP</a></th>
<th>Investorinformation</th>
<th>Morningstar</th>
<th>Nordnet</th>
<th>Saxo</th>
</tr>
{% for stock in stocks %}
<tr>
<td>{{ stock.name }}</td>
<td>{{ stock.isin }}</td>
<td>{% if stock.morningstar_aop %}{{ stock.morningstar_aop }}%{% endif %}</td>
<td>{% if stock.nordnet_aop %}{{ stock.nordnet_aop }}%{% endif %}</td>
<td>{% if stock.nordnet_prospect %}<a href="{{ stock.nordnet_prospect }}">Info</a>{% elif stock.morningstar_prospect %}<a href="{{ stock.morningstar_prospect }}">Info</a>{% endif %}</td>
<td>{% if stock.morningstar_url %}<a href="{{ stock.morningstar_url }}">Link</a>{% endif %}</td>
<td>{% if stock.nordnet_url %}<a href="{{ stock.nordnet_url }}">Link</a>{% endif %}</td>
<td>{% if stock.saxo_url %}<a href="{{ stock.saxo_url }}">Link</a>{% endif %}</td>
</tr>
{% endfor %}
</table>
<a name="forbehold"></a>
<h2>Forbehold</h2>
<p>Alt hvad du læser på denne side er løgn og fiktion fra ende til anden og har ingen relation til virkeligheden. Hvis du kunne finde på at læse indholdet, som om det omhandlede værdipapirer, eller at købe, sælge eller tage dig af din personlige hygiejne med værdipapirer på grund af indholdet på denne side, er det fuldstændig et hundrede procent på eget ansvar. Alt hvad der findes på siden er fejlbehæftet, forældet og lavet af en uduelig amatør uden forstand på noget som helst. Du skal regne med, at alle links fører til nogle andre værdipapirer, end man skulle tro, og at de værdipapirer som står til salg et sted sikkert ikke sælges der - og omvendt. Alle oplysninger om løbende omkostninger og ÅOP er fundet ved hjælp af hønebingo og dermed så godt som tilfældige.</p>
{% endspaceless %}{% endblock %}
Books I read, or: Python and Django let me realise my nerdiest dreams
I like to document my doings and for about 15 years I’ve been documenting the books I have read. First in Notepad, then in Excel and finally in Python and Django with a database somewhere in the background. I am amazed what experts help amateurs achieve.
This post explains the proces of collecting data about my reads in little detail and in too great detail the code behind the page.


Finding information ONLINE
Most data was crawled from Danish library ressources, Goodreads and Wikpedia with varying success. A lot was entered manually, especially with works in translation. I spent hours and hours being pedantic.
Even though librarians have been managing data longer than anyone else on the planet, there is no autoritative relational database where you can look up when some book by some author was first published and when the first Danish language version came out. In defence of librarians, many writers go to great lengths to make data management on books hard (one example is the genre “non-fiction novel” used by Spanish writer Javier Cercas).
The mysteries of Goodreads
I was mystified by the ability of Goodreads to place study guides and commentary to great works of literature first in their search results (and many more strange things) and terrified by Google displaying available nowhere else I could find on the web author birthdays on top of search results .
Also, Goodreads magically has editions of books that are older than when Goodreads claims the book was first published.




Adding books
After crawling for data, I made a form to add new books:


The form
This was a breeze in Django. Here’s forms.py:
from django.forms import ModelForm
from books.models import Author, Title, Read
class AuthorForm(ModelForm):
class Meta:
model = Author
fields = ['first_name', 'last_name','gender','country','biography','birth_date','data_quality']
class TitleForm(ModelForm):
class Meta:
model = Title
fields = ['title','genre','read_language','original_language','publisher','isbn','published_date','first_published','cover_url','ereolen_url','biblo_dk_url','good_reads_url','pages','original_title']
class ReadForm(ModelForm):
class Meta:
model = Read
fields = ['date']
The view:
And here’s the logic from views.py (I probably shouldn’t uncritically be saving cover URLs found on the internet to my server, but):
# Add a read to database
@login_required
def add_read(request):
book_saved = False
author_form = AuthorForm()
title_form = TitleForm()
read_form = ReadForm()
if request.method == 'POST': # AND SUBMIT BUTTON
author_form = AuthorForm(request.POST)
title_form = TitleForm(request.POST)
read_form = ReadForm(request.POST)
if author_form.is_valid() and title_form.is_valid() and read_form.is_valid():
author_data = author_form.cleaned_data
title_data = title_form.cleaned_data
read_data = read_form.cleaned_data
existing_author = False
existing_title = False
# AUTHOR LOGIC - MAY ALSO MODIFY TITLE DATA
# Check if already exist
try:
author = Author.objects.get(first_name=author_data['first_name'], last_name=author_data['last_name'])
existing_author = True
context['existing_author'] = existing_author
except:
if 'lookup' in request.POST:
if any(not value for value in author_data.values()):
author_data, title_data = get_author(author_data, title_data) # try to fetch data
# TITLE LOGIC - MAY ALSO MODIFY AUTHOR DATA
# Check if title already exists, will only work is author has been found. (Book is re-read)
try:
if author:
title = Title.objects.get(authors=author, title=title_data['title'])
existing_title = True
context['existing_title'] = True
except:
if 'lookup' in request.POST:
if any(not value for value in title_data.values()):
title_data, author_data = get_title(title_data, author_data) # try to fetch data
# Render form with data from database or collected data
if 'lookup' in request.POST:
if not existing_author:
author_form = AuthorForm(author_data)
else:
author_form = AuthorForm(instance=author)
if not existing_title:
title_form = TitleForm(title_data)
else:
title_form = TitleForm(instance=title)
# Save data
if 'save' in request.POST:
if not existing_author:
author = author_form.save()
if not existing_title:
title = title_form.save()
title.authors.add(author)
if title.cover_url:
file = requests.get(title.cover_url, stream=True)
save_location = settings.STATIC_ROOT + "books/covers/"
if '.jpg' in title.cover_url:
ending = '.jpg'
elif '.png' in title.cover_url:
ending = '.png'
elif '.webp' in title.cover_url:
ending = '.webp'
else:
ending = '.jpg'
id = title.id
filename = str(id) + ending
with open(save_location+filename, 'wb') as f:
file.raw.decode_content = True
shutil.copyfileobj(file.raw, f)
title.cover_filename = filename
title.save()
#create thumbnail
image = Image.open(save_location+filename).convert("RGB")
maxsize = 150, 150
image.thumbnail(maxsize, Image.ANTIALIAS)
image.save(save_location+"150/"+str(id)+".webp", "WEBP")
save_read = read_form.save(commit=False)
save_read.title = title
save_read = read_form.save()
# Set save variable to True and display empty form
book_saved = True
author_form = AuthorForm()
title_form = TitleForm()
read_form = ReadForm()
context = {'author_form': author_form, 'title_form': title_form, 'read_form': read_form, 'book_saved': book_saved}
return render(request, 'books/add.html', context)
The helper function
If you are a really curious and patient individual, you may be wondering about the get_author and get_title functions. You are in luck! Here is most of helpers.py which helps me scrape some data from the internet and will probably break in the future:
# HELPER FUNCTIONS #
def numbers_in_string(string):
numbers = sum(character.isdigit() for character in string)
return numbers
def get_author(author_data, title_data):
# WIKIPEDIA
if not author_data['biography']:
if not author_data['country'] == 'da':
url = 'https://en.wikipedia.org/w/index.php?search=intitle%3A%22' + author_data['first_name'] + " " + author_data['last_name'] + '%22&title=Special:Search&profile=advanced&fulltext=1&ns0=1'
else:
url = 'https://da.wikipedia.org/w/index.php?search=intitle%3A%22' + author_data['first_name'] + " " + author_data['last_name'] + '%22&title=Special:Search&profile=advanced&fulltext=1&ns0=1'
else:
url = author_data['biography']
author_request = requests.get(url)
if author_request.status_code == 200:
soup = BeautifulSoup(author_request.text, "lxml")
try:
first_result = soup.find('div', {'class':'mw-search-result-heading'}).a['href']
if not author_data['country'] == 'da':
result_page = 'https://en.wikipedia.org' + first_result
else:
result_page = 'https://da.wikipedia.org' + first_result
page_request = requests.get(result_page)
soup = BeautifulSoup(page_request.text, "lxml")
# If not provided, set biography
if not author_data['biography']:
author_data['biography'] = result_page
# If not provided, try to get birth_date
if not author_data['birth_date']:
try:
birthday = soup.find('span', {'class':'bday'}).string
author_data['birth_date'] = datetime.strptime(birthday, '%Y-%m-%d')
except:
try:
birthday = soup.find('th', text="Født").parent.get_text()
# sometimes the above doesn't return a space between year and next info causing a fuckup
try:
find_year = re.search("\d\d\d\d\S", birthday).span()[1]
birthday = birthday[:find_year-1] + " " + birthday[find_year+-1:]
except:
pass
# sometimes even more fuckery
try:
letters_and_numbers_together = re.search("[a-zA-Z]\d", birthday).span()[1]
birthday = birthday[:letters_and_numbers_together-1] + " " + birthday[letters_and_numbers_together-1:]
except:
pass
birthday_date = search_dates(birthday,languages=['da'])[0][1]
author_data['birth_date'] = birthday_date
except:
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
text = paragraph.get_text()
if '(født' in text:
birth_mention = text.find('(født')
birth_string = text[birth_mention+1:text.find(")",birth_mention)]
if len(birth_string) < 10: # just a year, probably
year = int(birth_string[5:10])
birthday = date(year,1,1)
author_data['birth_date'] = birthday
else:
birthday_date = search_dates(birth_string,languages=['da'])[0][1]
author_data['birth_date'] = birthday_date
break
# If not provided, try to get country
if not author_data['country']:
try:
birthplace = soup.find('div', {'class':'birthplace'}).get_text()
except:
try:
birthplace = soup.find('th', text="Born").parent.get_text()
except:
pass
if birthplace:
country = get_country(birthplace)
if not country:
try:
birthplace = soup.find('th', text="Nationality").find_next_sibling().string
country = get_country(birthplace)
except:
pass
if country:
author_data['country'] = country
if not title_data['original_language']:
if country == 'us' or country == 'sc' or contry == 'ir' or country == 'en' or country == 'au':
country = 'en'
title_data['original_language'] = country
except:
pass
# GENDER
if not author_data['gender']:
request = requests.get('https://gender-api.com/get?name=' + author_data['first_name'] + '&key=vCjPrydWvlRcMxGszD')
response = request.json()
if response['gender'] == 'male':
author_data['gender'] = 'm'
elif response['gender'] == 'female':
author_data['gender'] = 'f'
if not author_data['data_quality']:
if author_data['first_name'] and author_data['last_name'] and author_data['gender'] and author_data['country'] and author_data['birth_date'] and author_data['biography']:
author_data['data_quality'] = 'med'
else:
author_data['data_quality'] = 'bad'
# WIKIPEDIA ALTERNATIVE, ONLY FOR BOOKS READ IN DANISH
if not author_data['biography'] and author_data['first_name'] and title_data['read_language'] == 'da':
url = 'https://litteraturpriser.dk/henv/' + author_data['last_name'][0].lower() + '.htm'
request = requests.get(url)
soup = BeautifulSoup(request.text, "lxml")
links = soup.find_all('a', href=True)
for link in links:
if len(link['href']) > 7:
text = link.get_text().lower()
if author_data['last_name'].lower() + ", " + author_data['first_name'].lower() == text:
url = 'https://litteraturpriser.dk' + link['href']
request = requests.get(url)
soup = BeautifulSoup(request.text, "lxml")
author_data['biography'] = request.url
if not author_data['country']:
author_data['country'] = 'da'
if not author_data['birth_date']:
born = soup.find(text=re.compile('Født'))
if born:
birthday_date = search_dates(born,languages=['da'])[0][1]
author_data['birth_date'] = birthday_date
else:
born = soup.find(text=re.compile('f. '))
birth_year = int(re.search("\d\d\d\d", born).group())
author_data['birth_date'] = date(birth_year,1,1)
if not title_data['original_language']:
title_data['original_language'] = 'da'
break
return author_data, title_data
def get_ereolen(title_data, author_data):
# EREOLEN
soup = ""
if not title_data['ereolen_url']:
if title_data['isbn']:
url = 'https://ereolen.dk/search/ting/' + title_data['isbn'] + '?&facets[]=facet.type%3Aebog'
else:
url = 'https://ereolen.dk/search/ting/' + author_data['first_name'] + " " + author_data['last_name']+ " " + title_data['title'] + '?&facets[]=facet.type%3Aebog'
request = requests.get(url)
try:
search_soup = BeautifulSoup(request.text, "lxml")
links = [a['href'] for a in search_soup.find_all('a', href=True) if '/collection/' in a['href']]
book_request = requests.get('https://ereolen.dk' + links[0])
soup = BeautifulSoup(book_request.text, "lxml")
links = [a['href'] for a in soup.find_all('a', href=True) if '/object/' in a['href']]
# ebooks and audiobook versions
if len(links) == 4:
book_request = requests.get('https://ereolen.dk' + links[0])
soup = BeautifulSoup(book_request.text, "lxml")
# SAVE HIT URL
title_data['ereolen_url'] = 'https://ereolen.dk' + links[0]
except:
pass
else:
book_request = title_data['ereolen_url']
book_request = requests.get(book_request)
soup = BeautifulSoup(book_request.text, "lxml")
if soup:
if not title_data['published_date']:
try:
published = soup.find('div', class_={"field-name-ting-author"}).get_text()
published = int(re.search("[(]\d\d\d\d[)]", published).group()[1:5])
title_data['published_date'] = date(published,1,1)
except:
pass
if not title_data['isbn']:
try:
isbn_tag = soup.find('div', class_={"field-name-ting-details-isbn"})
title_data['isbn'] = isbn_tag.find('div', class_={"field-items"}).get_text()
except:
pass
if not title_data['publisher']:
try:
publisher_tag = soup.find('div', class_={"field-name-ting-details-publisher"})
title_data['publisher'] = publisher_tag.find('div', class_={"field-items"}).get_text()
except:
pass
if not title_data['pages']:
try:
page_tag = soup.find('div', class_={"field-name-ting-details-extent"})
title_data['pages'] = int(page_tag.find('div', class_={"field-items"}).get_text().replace(" sider",""))
except:
pass
if not title_data['original_title']:
try:
original_title_tag = soup.find('div', class_={"field-name-ting-details-source"})
title_data['original_title'] = original_title_tag.find('div', class_={"field-items"}).get_text()
except:
pass
if not title_data['cover_url']:
covers = [img['src'] for img in soup.find_all('img') if '/covers/' in img['src']]
title_data['cover_url'] = covers[0][:covers[0].find("?")]
return title_data, author_data
def get_bibliotek_dk(title_data, author_data):
search_url = 'https://bibliotek.dk/da/search/work?search_block_form=phrase.creator%3D%22' + author_data['first_name'] + " " + author_data['last_name'] + '%22+and+phrase.title%3D%22' + title_data['title'] + '%22&select_material_type=bibdk_frontpage&op=S%C3%B8g&n%2Famaterialetype%5Bterm.workType%253D%2522literature%2522%5D=term.workType%253D%2522literature%2522&year_op=%2522year_eq%2522&year_value=&form_id=search_block_form&sort=rank_main_title&page_id=bibdk_frontpage'
request = requests.get(search_url)
soup = BeautifulSoup(request.text, "lxml")
hits = soup.find_all('div', {'class':'work mobile-page'})
if not hits:
url = 'https://bibliotek.dk/da/search/work?search_block_form=' + author_data['first_name'] + " " + author_data['last_name'] + " " + title_data['title'] +'&select_material_type=bibdk_frontpage%2Fbog&op=S%C3%B8g&n%2Famaterialetype%5Bterm.workType%253D%2522literature%2522%5D=term.workType%253D%2522literature%2522&year_op=%2522year_eq%2522&year_value=&form_build_id=form-TQ8TlT3HGFiKXyvz6cCFaiuTMZKimuHMF-p4q1Mb8ZI&form_id=search_block_form&sort=rank_main_title&page_id=bibdk_frontpage#content'
request = requests.get(url)
soup = BeautifulSoup(request.text, "lxml")
hits = soup.find_all('div', {'class':'work mobile-page'})
for hit in hits:
id = hit['id']
title = hit.find('h2', {'class':'searchresult-work-title'}).get_text()
author = hit.h3.get_text()
if title_data['title'].lower() in title.lower() or title.lower() in title_data['title'].lower() or len(hits) == 1:
if 'basis' in id:
link = id.replace("basis","-basis:")
elif 'katalog' in id:
link = id.replace("katalog","-katalog:")
biblo_url = 'https://bibliotek.dk/da/work/' + link
request = requests.get(biblo_url)
if not title_data['biblo_dk_url']:
title_data['biblo_dk_url'] = biblo_url
soup = BeautifulSoup(request.text, "lxml")
if not title_data['cover_url']:
try:
img = soup.find('div', {'class':'bibdk-cover'}).img['src'].replace("/medium/","/large/")
img = img[:img.find("?")]
title_data['cover_url'] = img
except:
pass
book_data = soup.find('div', {'class':'manifestation-data'})
if not title_data['pages']:
try:
pages = book_data.find('div', {'class':'field-name-bibdk-mani-format'}).find('span', {'class':'openformat-field'}).string.strip()
pages = pages[:pages.find(" ")]
pages = int(pages)
title_data['pages'] = pages
except:
pass
if not title_data['publisher']:
try:
publisher = book_data.find('div', {'class':'field-name-bibdk-mani-publisher'}).find('span', {'property':'name'}).string
title_data['publisher'] = publisher
except:
pass
if not title_data['published_date'] or not title_data['first_published']:
try:
first_published = book_data.find('div', {'class':'field-name-bibdk-mani-originals'}).find('span', {'class':'openformat-field'}).string.strip()
published = int(re.search("\d\d\d\d", first_published).group())
if not title_data['published_date']:
title_data['published_date'] = date(published,1,1)
if not title_data['first_published'] and title_data['read_language'] == 'da' and title_data['original_language'] == 'da':
title_data['first_published'] = date(published,1,1)
except:
try:
pub_year = int(book_data.find('div', {'class':'field-name-bibdk-mani-pub-year'}).find('span', {'class':'openformat-field'}).string.strip())
title_data['published_date'] = date(pub_year,1,1)
if title_data['read_language'] == 'da' and title_data['original_language'] == 'da':
try:
edition = book_data.find('div', {'class':'field-name-bibdk-mani-edition'}).find('span', {'class':'openformat-field'}).string.strip()
if edition == "1. udgave":
title_data['first_published'] = date(pub_year,1,1)
except:
pass
except:
pass
break
return title_data, author_data
def get_goodreads(title_data, author_data):
if not title_data['good_reads_url']:
searchterm = author_data['first_name'] + " " + author_data['last_name'] + " " + title_data['title']
search_url = 'https://www.goodreads.com/search?utf8=✓&q=' + searchterm + '&search_type=books'
response = requests.get(search_url)
search_soup = BeautifulSoup(response.text, "lxml")
all_results = search_soup.find_all('tr', {'itemtype':'http://schema.org/Book'})
if not all_results:
search_url = 'https://www.goodreads.com/search?utf8=✓&q=' + title_data['title'] + '&search_type=books'
response = requests.get(search_url)
search_soup = BeautifulSoup(response.text, "lxml")
all_results = search_soup.find_all('tr', {'itemtype':'http://schema.org/Book'})
if all_results:
good_match = False
#exact match
for result in all_results:
gr_author = result.find('span', {'itemprop':'author'}).get_text().strip()
gr_author = gr_author.replace(' (Goodreads Author)','')
if " " in gr_author:
gr_author = gr_author.replace(" "," ")
elif " " in gr_author:
gr_author = gr_author.replace(" "," ")
gr_title = result.find('a', {'class':'bookTitle'})
gr_title_string = gr_title.get_text().strip()
title_url = gr_title['href']
if gr_title_string.lower() == title_data['title'].lower() and gr_author.lower() == author_data['first_name'].lower() + " " + author_data['last_name'].lower():
good_match = True
break
if good_match == True:
url = 'https://www.goodreads.com' + title_url
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
else:
links = search_soup.find_all('a', href=True)
books = [a['href'] for a in links if '/book/show/' in a['href']]
for book in books:
if not 'summary' in book and not 'analysis' in book and not 'lesson-plan' in book and not 'sidekick' in book and not 'teaching-with' in book and not 'study-guide' in book and not 'quicklet' in book and not 'lit-crit' in book and not author_data['last_name'].lower() in book:
url = 'https://www.goodreads.com' + book
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
heading = soup.find('h1', {'id': 'bookTitle'}).string
break
else:
url = title_data['good_reads_url']
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
if not title_data['good_reads_url']:
if '?' in url:
url = url[:url.rfind("?")]
title_data['good_reads_url'] = url
if not title_data['cover_url']:
try:
title_data['cover_url'] = soup.find('img', {"id" : "coverImage"})['src'].replace("compressed.","")
except:
pass
details = soup.find('div', {"id" : "details"})
details_text = details.get_text()
if not title_data['published_date']:
possible_dates = details.find_all('div', attrs={'class':'row'})
for item in possible_dates:
published_date = item.find(text=re.compile("Published"))
if published_date:
published_date = published_date.strip()
numbers = numbers_in_string(published_date)
if numbers > 4:
title_data['published_date'] = search_dates(published_date,languages=['en'])[0][1]
elif numbers == 4:
year = int(re.search("\d\d\d\d", published_date).group())
title_data['published_date'] = date(year,1,1)
if not title_data['first_published']:
try:
first_published = details.find('nobr').string.strip()
numbers = numbers_in_string(first_published)
if numbers > 4:
title_data['first_published'] = search_dates(first_published,languages=['en'])[0][1]
elif numbers == 4:
year = int(re.search("\d\d\d\d", first_published).group())
title_data['first_published'] = date(year,1,1)
except:
pass
if not title_data['pages']:
try:
pages = details.find('span', {'itemprop': 'numberOfPages'}).string
title_data['pages'] = int(pages[:pages.find(" ")])
except:
pass
if not title_data['publisher']:
try:
by_location = details_text.find("by ")
title_data['publisher'] = details_text[by_location+3:details_text.find("\n", by_location)]
except:
pass
if not title_data['isbn']:
try:
isbn = re.search("\d\d\d\d\d\d\d\d\d\d\d\d\d", details_text).group()
title_data['isbn'] = isbn
except:
try:
isbn = re.search("\d\d\d\d\d\d\d\d\d\d", details_text).group()
title_data['isbn'] = isbn
except:
pass
if not title_data['original_title'] and title_data['read_language'] != title_data['original_language']:
try:
parent = details.find('div', text="Original Title").parent
original_title = parent.find('div', {'class':'infoBoxRowItem'}).string
title_data['original_title'] = original_title
except:
pass
return title_data, author_data
def get_title(title_data, author_data):
if title_data['read_language'] == 'da':
title_data, author_data = get_ereolen(title_data, author_data)
title_data, author_data = get_bibliotek_dk(title_data, author_data)
title_data, author_data = get_goodreads(title_data, author_data)
#cover from ereolen, mofibo, saxo
# danish library request
else:
title_data, author_data = get_goodreads(title_data, author_data)
return title_data, author_data
The template
The simplicity:
<h1>Add book</h1>
{% if book_saved %}
<p>Bogen blev gemt!</p>
{% endif %}
<form method="post">
<p class="center"><input class="button blue" name="lookup" type="submit" value="Look up">
<input class="button green" name="save" type="submit" value="Save"></p>
<p class="center">
{% if author_form.biography.value %}
<a href="{{ author_form.biography.value }}">biografi</a>
{% endif %}
{% if title_form.good_reads_url.value %}
<a href="{{ title_form.good_reads_url.value }}">goodreads</a>
{% endif %}
{% if title_form.ereolen_url.value %}
<a href="{{ title_form.ereolen_url.value }}">ereolen</a>
{% endif %}
{% if title_form.biblo_dk_url.value %}
<a href="{{ title_form.biblo_dk_url.value }}">bibliotek.dk</a>
{% endif %}
</p>
{% csrf_token %}
<div class="grid addbook">
<div>
{{ author_form }}
</div>
<div>
{{ title_form }}
</div>
<div>
{{ read_form }}
{% if title_form.cover_url.value %}
<img class="cover" src="{{ title_form.cover_url.value }}">
{% endif %}
</div>
</div>
</form>
The data model
Here’s models.py with the embarrassing list of countries and languages (that I should have gotten from somewhere else) edited out:
from isbn_field import ISBNField
class Author(models.Model):
GENDER_CHOICES = [
('f', 'Female'),
('m', 'Male'),
('o', 'Other'),
]
DATA_QUALITY_CHOICES = [
('good', 'Good'),
('bad', 'Bad'),
('med', 'Medium'),
]
first_name = models.CharField('First name', max_length=500, blank=True)
last_name = models.CharField('Last name', max_length=500)
def __str__(self):
return self.first_name + " " + self.last_name
def get_titles(self):
return " & ".join([t.title for t in self.title_set.all()])
gender = models.CharField('Gender', choices=GENDER_CHOICES, max_length=1, blank=True)
birth_date = models.DateField(null=True, blank=True)
country = models.CharField('Country', choices=COUNTRY_CHOICES, max_length=2, blank=True)
biography = models.URLField('Biography url', max_length=500, blank=True)
data_quality = models.CharField('Datakvalitet', choices=DATA_QUALITY_CHOICES, max_length=4, blank=True)
class Meta:
ordering = ['last_name']
class Title(models.Model):
GENRE_CHOICES = [
('nf', 'Non-Fiction'),
('fi', 'Fiction'),
]
authors = models.ManyToManyField(Author)
def get_authors(self):
return " & ".join([t.first_name + " " + t.last_name for t in self.authors.all()])
get_authors.short_description = "Author(s)"
title = models.CharField('Title', max_length=500)
def __str__(self):
return self.title
read_language = models.CharField('Read in language', choices=LANGUAGE_CHOICES, max_length=2)
original_language = models.CharField('Original language', choices=LANGUAGE_CHOICES, max_length=2, blank=True)
original_title = models.CharField('Original title', max_length=500, blank=True)
genre = models.CharField('Overall genre', choices=GENRE_CHOICES, max_length=2)
publisher = models.CharField('Publisher', max_length=100, blank=True)
first_published = models.DateField(null=True, blank=True)
published_date = models.DateField(null=True, blank=True)
isbn = ISBNField(null=True, blank=True)
cover_filename = models.CharField('Cover filename', max_length=100, blank=True)
cover_url = models.URLField('Cover-url', max_length=500, blank=True)
pages = models.PositiveIntegerField(blank=True, null=True)
good_reads_url = models.URLField('Goodreads-url', max_length=500, blank=True)
ereolen_url = models.URLField('Ereolen-url', max_length=500, blank=True)
biblo_dk_url = models.URLField('Biblo-url', max_length=500, blank=True)
class Meta:
ordering = ['title']
class Read(models.Model):
title = models.ForeignKey(Title, on_delete=models.CASCADE)
date = models.DateField()
sort_order = models.PositiveIntegerField(blank=True, null=True)
The front page
The views.py function for the front page is short and sweet:
def index(request):
context = {}
context['request'] = request
reads = Read.objects.order_by('-date__year', 'date__month','sort_order','id').select_related('title')
context['reads'] = reads
context['months'] = [[i, calendar.month_abbr[i]] for i in range(1,13)]
return render(request, 'books/index.html', context)
And, while longer, I think the template loop is nice too, (although there is that clumsy nested loop):
{% regroup reads by date.year as years_list %}
{% for year, readings in years_list %}
<h2>{{ year }}</h2>
{% if year == 2015 %}
<p>I was on paternity leave most of this year which gave me time to read a lot, but not the mental surplus to register by month. This year I bought a Kindle which re-kindled (durr) my interest in reading.</p>
{% elif year == 2004 %}
<p>I was working in England from around September 2003 to February 2004. This gave me time to read a lot, but not the computer access at home necessary to register my reads precisely.</p>
{% elif year == 2003 %}
<p>The year I began registering my reads.</p>
{% elif year == 2002 %}
<p>This - and all years before - is from memory in 2003, so not really precise.</p>
{% endif %}
{% regroup readings by date.month as months_list %}
{% if year > 2004 and not year == 2015 %}
<div class="grid reads">
{% for month in months %}
<div class="flex">
<div>{{ month.1 }}</div>
{% for mon, reads in months_list %}
{% if mon == month.0 %}
{% for read in reads %}
<a title="{{ read.title }}" href="{% url 'books_book' read.title.id %}"><img class="frontcover" loading="lazy" src="{% static 'books/covers/150/' %}{{ read.title.id }}.webp"></a>
{% endfor %}
{% endif %}
{% endfor %}
</div>
{% endfor %}
</div>
{% else %}
{% for read in readings %}
<a href="{% url 'books_book' read.title.id %}"><img class="frontcover" loading="lazy" src="{% static 'books/covers/150/' %}{{ read.title.id }}.webp"></a>
{% endfor %}
{% endif %}
The statistics page
The charts on the statistics page are made with Chart.js which is so easy that you don’t even need to know Javascript.
Here’s the views.py function which could probably be sped up if I had any idea how (which I don’t):
def statistics(request):
context = {}
# All reads, used for lots of charts
reads = Read.objects.order_by('date__year').select_related('title').prefetch_related('title__authors')
context['reads'] = reads
# Books per year chart queryset
books_pages_per_year = Read.objects.values('date__year').annotate(Count('id'), Sum('title__pages'), Avg('title__pages')).order_by('date__year')
context['books_pages_per_year'] = books_pages_per_year
# Prepare year, value-dictionaries
genre_structure = {} # fiction vs. non-fiction
author_gender_structure = {} # male vs. female
author_birth_structure = {} # median age of authors
read_language_structure = {} # language of read
original_language_structure = {} # original language of read
language_choices = dict(Title.LANGUAGE_CHOICES) # look up dict for original languages
author_country_structure = {} # country of author
country_choices = dict(Author.COUNTRY_CHOICES)
book_age_structure = {} # median age of books
for read in reads:
year_of_read = read.date.year
# Put year keys in dictionaries
if not year_of_read in genre_structure: # check one = check all
genre_structure[year_of_read] = []
author_gender_structure[year_of_read] = []
author_birth_structure[year_of_read] = []
read_language_structure[year_of_read] = []
original_language_structure[year_of_read] = []
author_country_structure[year_of_read] = []
book_age_structure[year_of_read] = []
# Put values in dictionaries
if read.title.read_language == 'da' or read.title.read_language == 'en':
read_language_structure[year_of_read].append(read.title.read_language)
if read.title.original_language:
original_language_structure[year_of_read].append(language_choices[read.title.original_language])
if read.title.genre:
genre_structure[year_of_read].append(read.title.genre)
if read.title.first_published:
book_age_structure[year_of_read].append(read.title.first_published.year)
for author in read.title.authors.all():
if author.gender:
author_gender_structure[year_of_read].append(author.gender)
if author.birth_date:
author_birth_structure[year_of_read].append(author.birth_date.year)
if author.country:
author_country_structure[year_of_read].append(country_choices[author.country])
# Prepare datasets for charts
genres = {}
for year, genre_list in genre_structure.items():
number_of_titles = len(genre_list)
number_of_fiction_titles = sum(1 for genre in genre_list if genre == 'fi')
fiction_percentage = int(number_of_fiction_titles/number_of_titles*100)
non_fiction_percentage = 100 - fiction_percentage
genres[year] = [fiction_percentage, non_fiction_percentage]
context['genres'] = genres
median_author_age = {}
for year, birthyears in author_birth_structure.items():
birthyears = sorted(birthyears)
median_birthyear = birthyears[len(birthyears) // 2]
median_author_age[year] = year - median_birthyear
context['median_author_age'] = median_author_age
author_genders = {}
for year, genders in author_gender_structure.items():
number_of_authors = len(genders)
males = sum(1 for gender in genders if gender == 'm')
male_percentage = int(males/number_of_authors*100)
female_percentage = 100 - male_percentage
author_genders[year] = [male_percentage, female_percentage]
context['author_genders'] = author_genders
read_languages = {}
for year, languages in read_language_structure.items():
number_of_languages = len(languages)
danish = sum(1 for language in languages if language == 'da')
danish_percentage = int(danish / number_of_languages * 100)
english_percentage = 100 - danish_percentage
read_languages[year] = [danish_percentage, english_percentage]
context['read_languages'] = read_languages
original_languages = []
original_languages_years = []
for year, languages in original_language_structure.items():
if not year in original_languages_years:
original_languages_years.append(year)
for lang in languages:
if lang not in original_languages:
original_languages.append(lang)
original_languages_template = {}
for language in original_languages:
original_languages_template[language] = []
for year in original_languages_years:
count_of_language_in_year = sum(1 for lang in original_language_structure[year] if language == lang)
original_languages_template[language].append(count_of_language_in_year)
context['original_languages_template'] = original_languages_template
context['original_languages_years'] = original_languages_years
author_countries = []
author_countries_years = []
for year, countries in author_country_structure.items():
if not year in author_countries_years:
author_countries_years.append(year)
for country in countries:
if country not in author_countries:
author_countries.append(country)
author_countries_template = {}
for country in author_countries:
author_countries_template[country] = []
for year in author_countries_years:
count_of_country_in_year = sum(1 for countr in author_country_structure[year] if country == countr)
author_countries_template[country].append(count_of_country_in_year)
context['author_countries_template'] = author_countries_template
context['author_countries_years'] = author_countries_years
median_book_age = {}
for year, publish_years in book_age_structure.items():
publish_years = sorted(publish_years)
# account for no data in years
if len(publish_years) >= 2:
median_publish_year = publish_years[len(publish_years) // 2]
elif len(publish_years) == 1:
median_publish_year = publish_years[0]
else:
median_publish_year = 0
median_book_age[year] = year - median_publish_year
context['median_book_age'] = median_book_age
return render(request, 'books/statistics.html', context)
And a template example:
<div>
<h2>Reads per year</a>
<canvas id="books_per_year"></canvas>
</div>
<script>
var ctx = document.getElementById('books_per_year').getContext('2d');
var myChart = new Chart(ctx, {
type: 'bar',
data: {
labels: [{% for year in books_pages_per_year %}{% if not forloop.last %}{{ year.date__year }}, {% else %}{{ year.date__year }}{% endif %}{% endfor %}],
datasets: [{
label: 'Read',
data: [{% for year in books_pages_per_year %}{% if not forloop.last %}{{ year.id__count }}, {% else %}{{ year.id__count }}{% endif %}{% endfor %}],
backgroundColor: 'rgba(255, 99, 132, 0.2)',
borderColor: 'rgba(255, 99, 132, 1)',
borderWidth: 1
}]
},
options: {
tooltips: {
callbacks: {
label: function(tooltipItem, data) {
return data.datasets[tooltipItem.datasetIndex].label + ': ' + tooltipItem.value + ' books';
}
}
},
legend: {
display: false
},
responsive: true,
scales: {
yAxes: [{
ticks: {
beginAtZero: true
}
}]
}
}
});
</script>