# -*- coding: utf-8 -*-
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
"""This program logs into a Saxo Bank account and lets you make API requests."""
import requests
from datetime import datetime
from datetime import date
from bs4 import BeautifulSoup
# USER ACCOUNT AND PERIOD DATA. SHOULD BE EDITED FOR YOUR NEEDS #
# Saxo user account credentials
user = '' # your user id
password = '' # your password
# Start date (start of period for transactions) and date today used for extraction of transactions
startdate = '2019-01-01'
today = date.today()
enddate = datetime.strftime(today, '%Y-%m-%d')
# LOGIN TO SAXO BANK
# Start requests session and set user agent
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
# Visit login page and get AuthnRequest token value from input form
url = 'https://www.saxoinvestor.dk/Login/da/'
request = session.get(url)
soup = BeautifulSoup(request.text, "html.parser")
input = soup.find_all('input', {"id":"AuthnRequest"})
authnrequest = input[0]["value"]
# Login step 1: Submit username, password and token and get another token back
url = 'https://www.saxoinvestor.dk/Login/da/'
request = session.post(url, data = {'field_userid': user, 'field_password': password, 'AuthnRequest': authnrequest})
soup = BeautifulSoup(request.text, "html.parser")
input = soup.find_all('input', {"name":"SAMLResponse"})
# Most of the time this works
if input:
samlresponse = input[0]["value"]
# But sometimes there's a disclaimer that Saxo Bank would like you to accept
else:
input = soup.find_all('input')
inputs = {}
    for i in input:
        try:
            inputs[i['name']] = i['value']
        except KeyError:
            # Inputs without a name or value are skipped instead of aborting the loop
            pass
url = 'https://www.saxotrader.com/disclaimer'
request = session.post(url, data=inputs)
cook = request.cookies['DisclaimerApp']
returnurl = cook[cook.find("ReturnUrl")+10:cook.find("&IsClientStation")]
url = 'https://live.logonvalidation.net/complete-app-consent/' + returnurl[returnurl.find("complete-app-consent/")+21:]
request = session.get(url)
soup = BeautifulSoup(request.text, "html.parser")
input = soup.find_all('input', {"name":"SAMLResponse"})
samlresponse = input[0]["value"]
# Login step 2: Get bearer token necessary for API requests
url = 'https://www.saxoinvestor.dk/investor/login.sso.ashx'
response = session.post(url, data = {'SAMLResponse': samlresponse})
response_text = response.text
bearer = response_text[response_text.find("BEARER"):response_text.find("/exp/")]
# START API CALLS
# Documentation at https://www.developer.saxo/openapi/learn
# Set bearer token as header
headers = {'Authorization': bearer}
# First API request gets Client Key which is used for most API calls
# See https://www.developer.saxo/openapi/learn/the-tutorial for expected return data
url = 'https://www.saxoinvestor.dk/openapi/port/v1/clients/me'
r = requests.get(url, headers=headers)
clientdata = r.json()
clientkey = clientdata['ClientKey']
# Example API call #1
url = 'https://www.saxoinvestor.dk/openapi/cs/v1/reports/aggregatedAmounts/' + clientkey + '/' + startdate + '/' + enddate + '/'
r = requests.get(url, headers=headers)
data = r.json()
# Working on that data to add some transaction types to personal system
saxoaccountname = "Aktiesparekonto: Saxo Bank"
currency = "DKK"
saxotransactions = ""
for item in data['Data']:
if item['AffectsBalance'] == True:
date = item['Date']
amount = item['Amount']
amount_str = str(amount).replace(".",",")
if item['UnderlyingInstrumentDescription'] == 'Cash deposit or withdrawal' or item['UnderlyingInstrumentDescription'] == 'Cash inter-account transfer':
if amount > 0:
transactiontype = 'INDBETALING'
elif amount < 0:
transactiontype = 'HÆVNING'
saxotransactions += ";" + date + ";" + date + ";" + date + ";" + transactiontype + ";;;;;;;;" + amount_str + ";" + currency + ";;;;;;;;;" + saxoaccountname + "\r\n"
if item['AmountTypeName'] == 'Corporate Actions - Cash Dividends':
transactiontype = "UDB."
if item['InstrumentDescription'] == "Novo Nordisk B A/S":
paper = "Novo B"
papertype = "Aktie"
if item['InstrumentDescription'] == "Tryg A/S":
paper = "TRYG"
papertype = "Aktie"
saxotransactions += ";" + date + ";" + date + ";" + date + ";" + transactiontype + ";" + paper + ";" + papertype + ";;;;;;" + amount_str + ";" + currency + ";;;;;;;;;" + saxoaccountname + "\n"
# Example API call #2
url = "https://www.saxoinvestor.dk/openapi/cs/v1/reports/trades/" + clientkey + "?fromDate=" + startdate + "&" + "toDate=" + enddate
r = requests.get(url, headers=headers)
data = r.json()
# Working on that data to add trades to personal system
for item in data['Data']:
date = item['AdjustedTradeDate']
numberofpapers = str(int(item['Amount']))
amount_str = str(item['BookedAmountAccountCurrency']).replace(".",",")
priceperpaper = str(item['BookedAmountAccountCurrency'] / item['Amount']).replace(".",",")
if item['TradeEventType'] == 'Bought':
transactiontype = "KØBT"
if item['AssetType'] == 'Stock':
papertype = "Aktie"
if item['InstrumentDescription'] == "Novo Nordisk B A/S":
paper = "Novo B"
isin = "DK0060534915"
if item['InstrumentDescription'] == "Tryg A/S":
paper = "TRYG"
isin = "DK0060636678"
saxotransactions += ";" + date + ";" + date + ";" + date + ";" + transactiontype + ";" + paper + ";" + papertype + ";" + isin + ";" + numberofpapers + ";" + priceperpaper + ";;;" + amount_str + ";" + currency + ";;;;;;;;;" + saxoaccountname + "\n"
For various reasons, my operating system and my browser (Firefox) are in English. And for inscrutable reasons, Nordnet has launched an unfinished English browser interface where the entire navigation is missing.
Here is the menu with the Danish interface:
And this is what it looks like in English:
Something is missing!
Being the good citizen I am, I tried to get the problem fixed. But how?
Customer service!
I started by writing a friendly message:
And got a friendly, but useless, standard reply:
Was I even read? I certainly didn't feel seen. It is nice that Nordnet listens to customers' "feedback", but as you can read in the management literature, the impression a company or organisation makes is not about what they say they do, but about what they actually do (if, that is, what they do gets noticed).
I try again…
A little less polite, a little more explicit:
My budding scepticism is met with distrust. Am I sure I am seeing what I am seeing and documenting for them with nice screenshots?
I turn up the documentation and attach a notice I received asking me to confirm my contact details. It would be funny if it weren't so utterly unprofessional to show something like that to your customers. Well, it actually is a bit funny:
Bacon ipsum dolor amet landjaeger bacon bresaola… yes, thank you.
I wrote next to my screenshot:
Now Christine was back on the case, but she probably spent about 14 milliseconds too little looking at the screenshot I attached. In any case, she completely misses the point:
Apparently all of Nordnet's customers are different meat products…
So what do you do?
I have tweeted:
I have tried to connect on LinkedIn and draw attention to the problem:
So far I have not managed to get in touch with anyone at Nordnet with the competence or the will to fix the error.
I have not even managed to get in touch with anyone at Nordnet with the competence to tell the person with the competence to fix the problem about the problem.
I think this says something about me and something about Nordnet:
I am willing to spend (too) much time on services and problems that annoy me.
Nordnet is – like many other large companies – willing to spend so few resources on customer service that they risk real problems never reaching the right people.
Update, 19 May 2021
Yesterday afternoon I tried to make Nordnet's country manager for Denmark, Anne Buchardt, aware of the problem. She put Rasmus Järborg, who is Chief Product Officer, on the case, and Rasmus got a developer to take a closer look at it.
That helped!
Less than a day later, Nordnet had fixed the problem. Superb – but very expensive – customer service.
Quick response from Anne
Excellent customer support from Rasmus
Because I was contacted by some friendly school employees who are busy making their everyday life with AULA easier, I have updated my Python script with examples of what you can do in AULA without actually pointing your browser at AULA.
The new example (example 6) shows how to create a calendar event in AULA (the old examples settled for reading data from the system).
And for the curious: No, you cannot (readily) create events with Javascript in the "description". Yes, you can create events with inline CSS in the description, so that events end up looking rather special, visually speaking. (A small sketch of a styled description follows after the script below.)
# aula.py
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
''' An example of how to log in to the Danish LMS Aula (https://aula.dk) and
extract data from the API. Could be further developed to also submit data and/or to
create your own web or terminal interface(s) for Aula.'''
# Imports
import requests # Perform http/https requests
from bs4 import BeautifulSoup # Parse HTML pages
import json # Needed to print JSON API data
# User info
user = {
'username': '',
'password': ''
}
# Start requests session
session = requests.Session()
# Get login page
url = 'https://login.aula.dk/auth/login.php?type=unilogin'
response = session.get(url)
# Login is handled by a loop where each page is first parsed by BeautifulSoup.
# Then the destination of the form is saved as the next url to post to and all
# inputs are collected with special cases for the username and password input.
# Once the loop reaches the Aula front page the loop is exited. The loop has a
# maximum number of iterations to avoid an infinite loop if something changes
# with the Aula login.
counter = 0
success = False
while success == False and counter < 10:
try:
# Parse response using BeautifulSoup
soup = BeautifulSoup(response.text, "lxml")
# Get destination of form element (assumes only one)
url = soup.form['action']
# If form has a destination, inputs are collected and names and values
# for posting to form destination are saved to a dictionary called data
if url:
# Get all inputs from page
inputs = soup.find_all('input')
# Check whether page has inputs
if inputs:
# Create empty dictionary
data = {}
# Loop through inputs
for input in inputs:
# Some inputs may have no names or values so a try/except
# construction is used.
try:
# Login takes place in single input steps, which
# is the reason for the if/elif construction
# Save username if input is a username field
if input['name'] == 'username':
data[input['name']] = user['username']
# Save password if input is a password field
elif input['name'] == 'password':
data[input['name']] = user['password']
# For employees the login procedure has an additional field to select a role
# If an employee needs to login in a parent role, this value needs to be changed
elif input['name'] == 'selected-aktoer':
data[input['name']] = "MEDARBEJDER_EKSTERN"
# For all other inputs, save name and value of input
else:
data[input['name']] = input['value']
# If input has no value, an error is caught but needs no handling
# since inputs without values do not need to be posted to next
# destination.
except:
pass
# If there's data in the dictionary, it is submitted to the destination url
if data:
response = session.post(url, data=data)
# If there's no data, just try to post to the destination without data
else:
response = session.post(url)
# If the url of the response is the Aula front page, loop is exited
if response.url == 'https://www.aula.dk:443/portal/':
success = True
# If some error occurs, try to just ignore it
except:
pass
# One is added to counter each time the loop runs independent of outcome
counter += 1
# Login succeeded without an HTTP error code and API requests can begin
if success == True and response.status_code == 200:
print("Login lykkedes")
# All API requests go to the below url
# Each request has a number of parameters, of which method is always included
# Data is returned in JSON
url = 'https://www.aula.dk/api/v12/'
    ### First API request. This request must be run to generate correct cookies for subsequent requests. ###
params = {
'method': 'profiles.getProfilesByLogin'
}
# Perform request, convert to json and print on screen
response_profile = session.get(url, params=params).json()
print(json.dumps(response_profile, indent=4))
    ### Second API request. This request must be run to generate correct cookies for subsequent requests. ###
params = {
'method': 'profiles.getProfileContext',
'portalrole': 'guardian', # 'guardian' for parents (or other guardians), 'employee' for employees
}
# Perform request, convert to json and print on screen
response_profile_context = session.get(url, params=params).json()
print(json.dumps(response_profile_context, indent=4))
# Loop to get institutions and children associated with profile and save
# them to lists
institutions = []
institution_profiles = []
children = []
for institution in response_profile_context['data']['institutions']:
institutions.append(institution['institutionCode'])
institution_profiles.append(institution['institutionProfileId'])
for child in institution['children']:
children.append(child['id'])
children_and_institution_profiles = institution_profiles + children
### Third example API request, uses data collected from second request ###
params = {
'method': 'notifications.getNotificationsForActiveProfile',
'activeChildrenIds[]': children,
'activeInstitutionCodes[]': institutions
}
# Perform request, convert to json and print on screen
notifications_response = session.get(url, params=params).json()
print(json.dumps(notifications_response, indent=4))
### Fourth example API request, only succeeds when the third has been run before ###
params = {
'method': 'messaging.getThreads',
'sortOn': 'date',
'orderDirection': 'desc',
'page': '0'
}
# Perform request, convert to json and print on screen
response_threads = session.get(url, params=params).json()
print(json.dumps(response_threads, indent=4))
    ### Fifth example. getAllPosts uses a combination of children and institution profiles. ###
params = {
'method': 'posts.getAllPosts',
'parent': 'profile',
'index': "0",
'institutionProfileIds[]': children_and_institution_profiles,
'limit': '10'
}
# Perform request, convert to json and print on screen
response_threads = session.get(url, params=params).json()
print(json.dumps(response_threads, indent=4))
    ### Sixth example. Posting a calendar event. ###
params = (
('method', 'calendar.createSimpleEvent'),
)
# Manually setting the cookie "profile_change". This probably has to do with posting as a parent.
session.cookies['profile_change'] = '2'
# Csrfp-token is manually added to session headers.
session.headers['csrfp-token'] = session.cookies['Csrfp-Token']
data = {
'title': 'This is a test',
'description': '<p>A really nice test.</p>',
'startDateTime': '2021-05-18T14:30:00.0000+02:00',
'endDateTime': '2021-05-18T15:00:00.0000+02:00',
'startDate': '2021-05-17',
'endDate': '2021-05-17',
'startTime': '12:00:19',
'endTime': '12:30:19',
'id': '',
'institutionCode': response_profile['data']['profiles'][0]['institutionProfiles'][0]['institutionCode'],
'creatorInstProfileId': response_profile['data']['profiles'][0]['institutionProfiles'][0]['id'],
'type': 'event',
'allDay': False,
'private': False,
'primaryResource': {},
'additionalLocations': [],
'invitees': [],
'invitedGroups': [],
'invitedGroupIds': [],
'invitedGroupHomes': [],
'responseRequired': True,
'responseDeadline': None,
'resources': [],
'attachments': [],
'oldStartDateTime': '',
'oldEndDateTime': '',
'isEditEvent': False,
'addToInstitutionCalendar': False,
'hideInOwnCalendar': False,
'inviteeIds': [],
'additionalResources': [],
'pattern': 'never',
'occurenceLimit': 0,
'weekdayMask': [
False,
False,
False,
False,
False,
False,
False
],
'maxDate': None,
'interval': 0,
'lessonId': '',
'noteToClass': '',
'noteToSubstitute': '',
'eventId': '',
'isPrivate': False,
'resourceIds': [],
'additionalLocationIds': [],
'additionalResourceIds': [],
'attachmentIds': []
}
response_calendar = session.post(url, params=params, json=data).json()
print(json.dumps(response_calendar, indent=4))
# Login failed for some unknown reason
else:
print("Noget gik galt med login")
{% extends "stocks/base.html" %}
{% load static %}
{% block title %}ETF'er og fonde med aktiebeskatning 2021{% endblock %}
{% block content %}{% spaceless %}
<h1>ETF'er og fonde med aktiebeskatning 2021</h1>
<p>Du har læst om, <a href="https://www.nordnet.dk/blog/nye-regler-for-beskatning-af-investeringsfonde/">at aktiebaserede ETF'er og udenlandske investeringsfonde fra 2020 beskattes som aktieindkomst og ikke længere som kapitalindkomst</a>.</p>
<p>Du har endda fundet <a href="https://skat.dk/getfile.aspx?id=145013&type=xlsx">det fine regneark, der viser aktiebaserede investeringsselskaber</a> på <a href="https://skat.dk/skat.aspx?oid=2244641">skat.dk</a>.</p>
<p>Men det er godt nok svært for dig at få overblik over, hvilke af papirerne du overhovedet kan købe som almindelig hobby-/cryptoinvestor, og at sammenligne omkostninger, ÅOP og hvad det ellers hedder, for at finde det rigtige køb.</p>
<p>Her er et forsøg på at løse dit (og mit) problem. Data kommer fra <a href="https://skat.dk/getfile.aspx?id=145013&type=xlsx">det fine regneark</a> og har samme fejl og mangler, men er suppleret med nyttige informationer og links.</p>
<p><a href="#forbehold">Du kan læse om forbehold nederst på siden</a> og du kan <a href="https://helmstedt.dk/2021/03/etfer-og-fonde-med-aktiebeskatning-2021/">læse om hvordan siden er lavet på min blog</a>.</p>
<p><strong>Vis til salg hos:</strong>
<form id="prefs">
<input type="radio" id="nordnetsaxo" name="filter" value="nordnetsaxo"{% if request.GET.filter == "nordnetsaxo" or not request.GET.filter %} checked{% endif %}>
<label title="Værdipapirer til salg hos Nordnet, Saxo Bank eller begge steder" for="nordnetsaxo">Nordnet og/eller Saxo Bank</label>
<input type="radio" id="nordnet" name="filter" value="nordnet"{% if request.GET.filter == "nordnet" %} checked{% endif %}>
<label title="Værdipapirer til salg hos Nordnet" for="nordnet">Nordnet</label>
<input type="radio" id="saxo" name="filter" value="saxo"{% if request.GET.filter == "saxo" %} checked{% endif %}>
<label title="Værdipapirer til salg hos Saxo Bank" for="saxo">Saxo Bank</label>
<input type="radio" id="ikkenordnetsaxo" name="filter" value="ikkenordnetsaxo"{% if request.GET.filter == "ikkenordnetsaxo" %} checked{% endif %}>
<label title="Værdipapirer, der hverken er til salg hos Nordnet eller Saxo Bank" for="ikkenordnetsaxo">Ikke Nordnet og/eller Saxo</label>
<input type="radio" id="alle" name="filter" value="alle"{% if request.GET.filter == "alle" %} checked{% endif %}>
<label title="Alle værdipapirer, både dem der kan købes hos Nordnet/Saxo Bank og de, der ikke kan" for="alle">Hele pivtøjet</label>
</form>
</p>
<table>
<tr>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "-name" %}name{% else %}-name{% endif %}">Navn</a></th>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "isin" %}-isin{% else %}isin{% endif %}">Isin</a></th>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "morningstar_aop" %}-morningstar_aop{% else %}morningstar_aop{% endif %}">Løbende omkostninger</a></th>
<th><a href="{% url 'stocks_index' %}?sort={% if request.GET.sort == "nordnet_aop" %}-nordnet_aop{% else %}nordnet_aop{% endif %}">ÅOP</a></th>
<th>Investorinformation</th>
<th>Morningstar</th>
<th>Nordnet</th>
<th>Saxo</th>
</tr>
{% for stock in stocks %}
<tr>
<td>{{ stock.name }}</td>
<td>{{ stock.isin }}</td>
<td>{% if stock.morningstar_aop %}{{ stock.morningstar_aop }}%{% endif %}</td>
<td>{% if stock.nordnet_aop %}{{ stock.nordnet_aop }}%{% endif %}</td>
<td>{% if stock.nordnet_prospect %}<a href="{{ stock.nordnet_prospect }}">Info</a>{% elif stock.morningstar_prospect %}<a href="{{ stock.morningstar_prospect }}">Info</a>{% endif %}</td>
<td>{% if stock.morningstar_url %}<a href="{{ stock.morningstar_url }}">Link</a>{% endif %}</td>
<td>{% if stock.nordnet_url %}<a href="{{ stock.nordnet_url }}">Link</a>{% endif %}</td>
<td>{% if stock.saxo_url %}<a href="{{ stock.saxo_url }}">Link</a>{% endif %}</td>
</tr>
{% endfor %}
</table>
<a name="forbehold"></a>
<h2>Forbehold</h2>
<p>Alt hvad du læser på denne side er løgn og fiktion fra ende til anden og har ingen relation til virkeligheden. Hvis du kunne finde på at læse indholdet, som om det omhandlede værdipapirer, eller at købe, sælge eller tage dig af din personlige hygiejne med værdipapirer på grund af indholdet på denne side, er det fuldstændig et hundrede procent på eget ansvar. Alt hvad der findes på siden er fejlbehæftet, forældet og lavet af en uduelig amatør uden forstand på noget som helst. Du skal regne med, at alle links fører til nogle andre værdipapirer, end man skulle tro, og at de værdipapirer som står til salg et sted sikkert ikke sælges der - og omvendt. Alle oplysninger om løbende omkostninger og ÅOP er fundet ved hjælp af hønebingo og dermed så godt som tilfældige.</p>
{% endspaceless %}{% endblock %}
I like to document my doings, and for about 15 years I have been documenting the books I read. First in Notepad, then in Excel, and finally in Python and Django with a database somewhere in the background. I am amazed at what experts help amateurs achieve.
This post explains the process of collecting data about my reads in little detail, and the code behind the page in too great detail.
Some books of 2020
Statistics
Finding information ONLINE
Most data was crawled from Danish library resources, Goodreads and Wikipedia with varying success. A lot was entered manually, especially for works in translation. I spent hours and hours being pedantic.
Even though librarians have been managing data longer than anyone else on the planet, there is no authoritative relational database where you can look up when some book by some author was first published and when the first Danish-language version came out. In defence of librarians, many writers go to great lengths to make data management on books hard (one example is the genre "non-fiction novel" used by Spanish writer Javier Cercas).
The mysteries of Goodreads
I was mystified by the ability of Goodreads to place study guides and commentary on great works of literature first in their search results (and many more strange things), and terrified by Google displaying author birthdays, available nowhere else I could find on the web, on top of its search results.
Also, Goodreads magically has editions of books that are older than when Goodreads claims the book was first published.
Goodreads: When what you’re searching for is nowhere near the first hit
How does this autocomplete work?
I wonder?
First published on April 5, but first listed edition is from March 23. Huh?
Adding books
After crawling for data, I made a form to add new books:
Step 1. Push "Look up"
PROFIT!
The form
This was a breeze in Django. Here’s forms.py:
from django.forms import ModelForm
from books.models import Author, Title, Read
class AuthorForm(ModelForm):
class Meta:
model = Author
fields = ['first_name', 'last_name','gender','country','biography','birth_date','data_quality']
class TitleForm(ModelForm):
class Meta:
model = Title
fields = ['title','genre','read_language','original_language','publisher','isbn','published_date','first_published','cover_url','ereolen_url','biblo_dk_url','good_reads_url','pages','original_title']
class ReadForm(ModelForm):
class Meta:
model = Read
fields = ['date']
The view:
And here’s the logic from views.py (I probably shouldn’t uncritically be saving cover URLs found on the internet to my server, but):
# Add a read to database
@login_required
def add_read(request):
    book_saved = False
    context = {}
author_form = AuthorForm()
title_form = TitleForm()
read_form = ReadForm()
if request.method == 'POST': # AND SUBMIT BUTTON
author_form = AuthorForm(request.POST)
title_form = TitleForm(request.POST)
read_form = ReadForm(request.POST)
if author_form.is_valid() and title_form.is_valid() and read_form.is_valid():
author_data = author_form.cleaned_data
title_data = title_form.cleaned_data
read_data = read_form.cleaned_data
existing_author = False
existing_title = False
# AUTHOR LOGIC - MAY ALSO MODIFY TITLE DATA
# Check if already exist
try:
author = Author.objects.get(first_name=author_data['first_name'], last_name=author_data['last_name'])
existing_author = True
context['existing_author'] = existing_author
except:
if 'lookup' in request.POST:
if any(not value for value in author_data.values()):
author_data, title_data = get_author(author_data, title_data) # try to fetch data
# TITLE LOGIC - MAY ALSO MODIFY AUTHOR DATA
            # Check if title already exists; will only work if the author has been found. (Book is re-read)
try:
if author:
title = Title.objects.get(authors=author, title=title_data['title'])
existing_title = True
context['existing_title'] = True
except:
if 'lookup' in request.POST:
if any(not value for value in title_data.values()):
title_data, author_data = get_title(title_data, author_data) # try to fetch data
# Render form with data from database or collected data
if 'lookup' in request.POST:
if not existing_author:
author_form = AuthorForm(author_data)
else:
author_form = AuthorForm(instance=author)
if not existing_title:
title_form = TitleForm(title_data)
else:
title_form = TitleForm(instance=title)
# Save data
if 'save' in request.POST:
if not existing_author:
author = author_form.save()
if not existing_title:
title = title_form.save()
title.authors.add(author)
if title.cover_url:
file = requests.get(title.cover_url, stream=True)
save_location = settings.STATIC_ROOT + "books/covers/"
if '.jpg' in title.cover_url:
ending = '.jpg'
elif '.png' in title.cover_url:
ending = '.png'
elif '.webp' in title.cover_url:
ending = '.webp'
else:
ending = '.jpg'
id = title.id
filename = str(id) + ending
with open(save_location+filename, 'wb') as f:
file.raw.decode_content = True
shutil.copyfileobj(file.raw, f)
title.cover_filename = filename
title.save()
#create thumbnail
image = Image.open(save_location+filename).convert("RGB")
maxsize = 150, 150
image.thumbnail(maxsize, Image.ANTIALIAS)
image.save(save_location+"150/"+str(id)+".webp", "WEBP")
save_read = read_form.save(commit=False)
save_read.title = title
                save_read.save()
# Set save variable to True and display empty form
book_saved = True
author_form = AuthorForm()
title_form = TitleForm()
read_form = ReadForm()
    context.update({'author_form': author_form, 'title_form': title_form, 'read_form': read_form, 'book_saved': book_saved})
return render(request, 'books/add.html', context)
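A side note on the remark above about uncritically saving cover URLs to my server: here is a minimal sketch of a sanity check that could run before the download. The helper name and the size limit are my own assumptions, not part of the code above:
import requests
MAX_COVER_BYTES = 5 * 1024 * 1024 # assumed upper limit for a cover image
def looks_like_safe_cover(url):
    '''Hypothetical helper: check content type and size before fetching a cover.'''
    head = requests.head(url, allow_redirects=True, timeout=10)
    content_type = head.headers.get('Content-Type', '')
    content_length = int(head.headers.get('Content-Length', 0) or 0)
    return content_type.startswith('image/') and 0 < content_length <= MAX_COVER_BYTES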
The helper function
If you are a really curious and patient individual, you may be wondering about the get_author and get_title functions. You are in luck! Here is most of helpers.py which helps me scrape some data from the internet and will probably break in the future:
# HELPER FUNCTIONS #
def numbers_in_string(string):
numbers = sum(character.isdigit() for character in string)
return numbers
def get_author(author_data, title_data):
# WIKIPEDIA
if not author_data['biography']:
if not author_data['country'] == 'da':
url = 'https://en.wikipedia.org/w/index.php?search=intitle%3A%22' + author_data['first_name'] + " " + author_data['last_name'] + '%22&title=Special:Search&profile=advanced&fulltext=1&ns0=1'
else:
url = 'https://da.wikipedia.org/w/index.php?search=intitle%3A%22' + author_data['first_name'] + " " + author_data['last_name'] + '%22&title=Special:Search&profile=advanced&fulltext=1&ns0=1'
else:
url = author_data['biography']
author_request = requests.get(url)
if author_request.status_code == 200:
soup = BeautifulSoup(author_request.text, "lxml")
try:
first_result = soup.find('div', {'class':'mw-search-result-heading'}).a['href']
if not author_data['country'] == 'da':
result_page = 'https://en.wikipedia.org' + first_result
else:
result_page = 'https://da.wikipedia.org' + first_result
page_request = requests.get(result_page)
soup = BeautifulSoup(page_request.text, "lxml")
# If not provided, set biography
if not author_data['biography']:
author_data['biography'] = result_page
# If not provided, try to get birth_date
if not author_data['birth_date']:
try:
birthday = soup.find('span', {'class':'bday'}).string
author_data['birth_date'] = datetime.strptime(birthday, '%Y-%m-%d')
except:
try:
birthday = soup.find('th', text="Født").parent.get_text()
# sometimes the above doesn't return a space between year and next info causing a fuckup
try:
find_year = re.search("\d\d\d\d\S", birthday).span()[1]
birthday = birthday[:find_year-1] + " " + birthday[find_year+-1:]
except:
pass
# sometimes even more fuckery
try:
letters_and_numbers_together = re.search("[a-zA-Z]\d", birthday).span()[1]
birthday = birthday[:letters_and_numbers_together-1] + " " + birthday[letters_and_numbers_together-1:]
except:
pass
birthday_date = search_dates(birthday,languages=['da'])[0][1]
author_data['birth_date'] = birthday_date
except:
paragraphs = soup.find_all('p')
for paragraph in paragraphs:
text = paragraph.get_text()
if '(født' in text:
birth_mention = text.find('(født')
birth_string = text[birth_mention+1:text.find(")",birth_mention)]
if len(birth_string) < 10: # just a year, probably
year = int(birth_string[5:10])
birthday = date(year,1,1)
author_data['birth_date'] = birthday
else:
birthday_date = search_dates(birth_string,languages=['da'])[0][1]
author_data['birth_date'] = birthday_date
break
# If not provided, try to get country
if not author_data['country']:
try:
birthplace = soup.find('div', {'class':'birthplace'}).get_text()
except:
try:
birthplace = soup.find('th', text="Born").parent.get_text()
except:
pass
if birthplace:
country = get_country(birthplace)
if not country:
try:
birthplace = soup.find('th', text="Nationality").find_next_sibling().string
country = get_country(birthplace)
except:
pass
if country:
author_data['country'] = country
if not title_data['original_language']:
                        if country == 'us' or country == 'sc' or country == 'ir' or country == 'en' or country == 'au':
country = 'en'
title_data['original_language'] = country
except:
pass
# GENDER
if not author_data['gender']:
request = requests.get('https://gender-api.com/get?name=' + author_data['first_name'] + '&key=vCjPrydWvlRcMxGszD')
response = request.json()
if response['gender'] == 'male':
author_data['gender'] = 'm'
elif response['gender'] == 'female':
author_data['gender'] = 'f'
if not author_data['data_quality']:
if author_data['first_name'] and author_data['last_name'] and author_data['gender'] and author_data['country'] and author_data['birth_date'] and author_data['biography']:
author_data['data_quality'] = 'med'
else:
author_data['data_quality'] = 'bad'
# WIKIPEDIA ALTERNATIVE, ONLY FOR BOOKS READ IN DANISH
if not author_data['biography'] and author_data['first_name'] and title_data['read_language'] == 'da':
url = 'https://litteraturpriser.dk/henv/' + author_data['last_name'][0].lower() + '.htm'
request = requests.get(url)
soup = BeautifulSoup(request.text, "lxml")
links = soup.find_all('a', href=True)
for link in links:
if len(link['href']) > 7:
text = link.get_text().lower()
if author_data['last_name'].lower() + ", " + author_data['first_name'].lower() == text:
url = 'https://litteraturpriser.dk' + link['href']
request = requests.get(url)
soup = BeautifulSoup(request.text, "lxml")
author_data['biography'] = request.url
if not author_data['country']:
author_data['country'] = 'da'
if not author_data['birth_date']:
born = soup.find(text=re.compile('Født'))
if born:
birthday_date = search_dates(born,languages=['da'])[0][1]
author_data['birth_date'] = birthday_date
else:
born = soup.find(text=re.compile('f. '))
birth_year = int(re.search("\d\d\d\d", born).group())
author_data['birth_date'] = date(birth_year,1,1)
if not title_data['original_language']:
title_data['original_language'] = 'da'
break
return author_data, title_data
def get_ereolen(title_data, author_data):
# EREOLEN
soup = ""
if not title_data['ereolen_url']:
if title_data['isbn']:
url = 'https://ereolen.dk/search/ting/' + title_data['isbn'] + '?&facets[]=facet.type%3Aebog'
else:
url = 'https://ereolen.dk/search/ting/' + author_data['first_name'] + " " + author_data['last_name']+ " " + title_data['title'] + '?&facets[]=facet.type%3Aebog'
request = requests.get(url)
try:
search_soup = BeautifulSoup(request.text, "lxml")
links = [a['href'] for a in search_soup.find_all('a', href=True) if '/collection/' in a['href']]
book_request = requests.get('https://ereolen.dk' + links[0])
soup = BeautifulSoup(book_request.text, "lxml")
links = [a['href'] for a in soup.find_all('a', href=True) if '/object/' in a['href']]
# ebooks and audiobook versions
if len(links) == 4:
book_request = requests.get('https://ereolen.dk' + links[0])
soup = BeautifulSoup(book_request.text, "lxml")
# SAVE HIT URL
title_data['ereolen_url'] = 'https://ereolen.dk' + links[0]
except:
pass
else:
book_request = title_data['ereolen_url']
book_request = requests.get(book_request)
soup = BeautifulSoup(book_request.text, "lxml")
if soup:
if not title_data['published_date']:
try:
published = soup.find('div', class_={"field-name-ting-author"}).get_text()
published = int(re.search("[(]\d\d\d\d[)]", published).group()[1:5])
title_data['published_date'] = date(published,1,1)
except:
pass
if not title_data['isbn']:
try:
isbn_tag = soup.find('div', class_={"field-name-ting-details-isbn"})
title_data['isbn'] = isbn_tag.find('div', class_={"field-items"}).get_text()
except:
pass
if not title_data['publisher']:
try:
publisher_tag = soup.find('div', class_={"field-name-ting-details-publisher"})
title_data['publisher'] = publisher_tag.find('div', class_={"field-items"}).get_text()
except:
pass
if not title_data['pages']:
try:
page_tag = soup.find('div', class_={"field-name-ting-details-extent"})
title_data['pages'] = int(page_tag.find('div', class_={"field-items"}).get_text().replace(" sider",""))
except:
pass
if not title_data['original_title']:
try:
original_title_tag = soup.find('div', class_={"field-name-ting-details-source"})
title_data['original_title'] = original_title_tag.find('div', class_={"field-items"}).get_text()
except:
pass
if not title_data['cover_url']:
covers = [img['src'] for img in soup.find_all('img') if '/covers/' in img['src']]
title_data['cover_url'] = covers[0][:covers[0].find("?")]
return title_data, author_data
def get_bibliotek_dk(title_data, author_data):
search_url = 'https://bibliotek.dk/da/search/work?search_block_form=phrase.creator%3D%22' + author_data['first_name'] + " " + author_data['last_name'] + '%22+and+phrase.title%3D%22' + title_data['title'] + '%22&select_material_type=bibdk_frontpage&op=S%C3%B8g&n%2Famaterialetype%5Bterm.workType%253D%2522literature%2522%5D=term.workType%253D%2522literature%2522&year_op=%2522year_eq%2522&year_value=&form_id=search_block_form&sort=rank_main_title&page_id=bibdk_frontpage'
request = requests.get(search_url)
soup = BeautifulSoup(request.text, "lxml")
hits = soup.find_all('div', {'class':'work mobile-page'})
if not hits:
url = 'https://bibliotek.dk/da/search/work?search_block_form=' + author_data['first_name'] + " " + author_data['last_name'] + " " + title_data['title'] +'&select_material_type=bibdk_frontpage%2Fbog&op=S%C3%B8g&n%2Famaterialetype%5Bterm.workType%253D%2522literature%2522%5D=term.workType%253D%2522literature%2522&year_op=%2522year_eq%2522&year_value=&form_build_id=form-TQ8TlT3HGFiKXyvz6cCFaiuTMZKimuHMF-p4q1Mb8ZI&form_id=search_block_form&sort=rank_main_title&page_id=bibdk_frontpage#content'
request = requests.get(url)
soup = BeautifulSoup(request.text, "lxml")
hits = soup.find_all('div', {'class':'work mobile-page'})
for hit in hits:
id = hit['id']
title = hit.find('h2', {'class':'searchresult-work-title'}).get_text()
author = hit.h3.get_text()
if title_data['title'].lower() in title.lower() or title.lower() in title_data['title'].lower() or len(hits) == 1:
if 'basis' in id:
link = id.replace("basis","-basis:")
elif 'katalog' in id:
link = id.replace("katalog","-katalog:")
biblo_url = 'https://bibliotek.dk/da/work/' + link
request = requests.get(biblo_url)
if not title_data['biblo_dk_url']:
title_data['biblo_dk_url'] = biblo_url
soup = BeautifulSoup(request.text, "lxml")
if not title_data['cover_url']:
try:
img = soup.find('div', {'class':'bibdk-cover'}).img['src'].replace("/medium/","/large/")
img = img[:img.find("?")]
title_data['cover_url'] = img
except:
pass
book_data = soup.find('div', {'class':'manifestation-data'})
if not title_data['pages']:
try:
pages = book_data.find('div', {'class':'field-name-bibdk-mani-format'}).find('span', {'class':'openformat-field'}).string.strip()
pages = pages[:pages.find(" ")]
pages = int(pages)
title_data['pages'] = pages
except:
pass
if not title_data['publisher']:
try:
publisher = book_data.find('div', {'class':'field-name-bibdk-mani-publisher'}).find('span', {'property':'name'}).string
title_data['publisher'] = publisher
except:
pass
if not title_data['published_date'] or not title_data['first_published']:
try:
first_published = book_data.find('div', {'class':'field-name-bibdk-mani-originals'}).find('span', {'class':'openformat-field'}).string.strip()
published = int(re.search("\d\d\d\d", first_published).group())
if not title_data['published_date']:
title_data['published_date'] = date(published,1,1)
if not title_data['first_published'] and title_data['read_language'] == 'da' and title_data['original_language'] == 'da':
title_data['first_published'] = date(published,1,1)
except:
try:
pub_year = int(book_data.find('div', {'class':'field-name-bibdk-mani-pub-year'}).find('span', {'class':'openformat-field'}).string.strip())
title_data['published_date'] = date(pub_year,1,1)
if title_data['read_language'] == 'da' and title_data['original_language'] == 'da':
try:
edition = book_data.find('div', {'class':'field-name-bibdk-mani-edition'}).find('span', {'class':'openformat-field'}).string.strip()
if edition == "1. udgave":
title_data['first_published'] = date(pub_year,1,1)
except:
pass
except:
pass
break
return title_data, author_data
def get_goodreads(title_data, author_data):
if not title_data['good_reads_url']:
searchterm = author_data['first_name'] + " " + author_data['last_name'] + " " + title_data['title']
search_url = 'https://www.goodreads.com/search?utf8=✓&q=' + searchterm + '&search_type=books'
response = requests.get(search_url)
search_soup = BeautifulSoup(response.text, "lxml")
all_results = search_soup.find_all('tr', {'itemtype':'http://schema.org/Book'})
if not all_results:
search_url = 'https://www.goodreads.com/search?utf8=✓&q=' + title_data['title'] + '&search_type=books'
response = requests.get(search_url)
search_soup = BeautifulSoup(response.text, "lxml")
all_results = search_soup.find_all('tr', {'itemtype':'http://schema.org/Book'})
if all_results:
good_match = False
#exact match
for result in all_results:
gr_author = result.find('span', {'itemprop':'author'}).get_text().strip()
gr_author = gr_author.replace(' (Goodreads Author)','')
if " " in gr_author:
gr_author = gr_author.replace(" "," ")
elif " " in gr_author:
gr_author = gr_author.replace(" "," ")
gr_title = result.find('a', {'class':'bookTitle'})
gr_title_string = gr_title.get_text().strip()
title_url = gr_title['href']
if gr_title_string.lower() == title_data['title'].lower() and gr_author.lower() == author_data['first_name'].lower() + " " + author_data['last_name'].lower():
good_match = True
break
if good_match == True:
url = 'https://www.goodreads.com' + title_url
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
else:
links = search_soup.find_all('a', href=True)
books = [a['href'] for a in links if '/book/show/' in a['href']]
for book in books:
if not 'summary' in book and not 'analysis' in book and not 'lesson-plan' in book and not 'sidekick' in book and not 'teaching-with' in book and not 'study-guide' in book and not 'quicklet' in book and not 'lit-crit' in book and not author_data['last_name'].lower() in book:
url = 'https://www.goodreads.com' + book
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
heading = soup.find('h1', {'id': 'bookTitle'}).string
break
else:
url = title_data['good_reads_url']
response = requests.get(url)
soup = BeautifulSoup(response.text, "lxml")
if not title_data['good_reads_url']:
if '?' in url:
url = url[:url.rfind("?")]
title_data['good_reads_url'] = url
if not title_data['cover_url']:
try:
title_data['cover_url'] = soup.find('img', {"id" : "coverImage"})['src'].replace("compressed.","")
except:
pass
details = soup.find('div', {"id" : "details"})
details_text = details.get_text()
if not title_data['published_date']:
possible_dates = details.find_all('div', attrs={'class':'row'})
for item in possible_dates:
published_date = item.find(text=re.compile("Published"))
if published_date:
published_date = published_date.strip()
numbers = numbers_in_string(published_date)
if numbers > 4:
title_data['published_date'] = search_dates(published_date,languages=['en'])[0][1]
elif numbers == 4:
year = int(re.search("\d\d\d\d", published_date).group())
title_data['published_date'] = date(year,1,1)
if not title_data['first_published']:
try:
first_published = details.find('nobr').string.strip()
numbers = numbers_in_string(first_published)
if numbers > 4:
title_data['first_published'] = search_dates(first_published,languages=['en'])[0][1]
elif numbers == 4:
year = int(re.search("\d\d\d\d", first_published).group())
title_data['first_published'] = date(year,1,1)
except:
pass
if not title_data['pages']:
try:
pages = details.find('span', {'itemprop': 'numberOfPages'}).string
title_data['pages'] = int(pages[:pages.find(" ")])
except:
pass
if not title_data['publisher']:
try:
by_location = details_text.find("by ")
title_data['publisher'] = details_text[by_location+3:details_text.find("\n", by_location)]
except:
pass
if not title_data['isbn']:
try:
isbn = re.search("\d\d\d\d\d\d\d\d\d\d\d\d\d", details_text).group()
title_data['isbn'] = isbn
except:
try:
isbn = re.search("\d\d\d\d\d\d\d\d\d\d", details_text).group()
title_data['isbn'] = isbn
except:
pass
if not title_data['original_title'] and title_data['read_language'] != title_data['original_language']:
try:
parent = details.find('div', text="Original Title").parent
original_title = parent.find('div', {'class':'infoBoxRowItem'}).string
title_data['original_title'] = original_title
except:
pass
return title_data, author_data
def get_title(title_data, author_data):
if title_data['read_language'] == 'da':
title_data, author_data = get_ereolen(title_data, author_data)
title_data, author_data = get_bibliotek_dk(title_data, author_data)
title_data, author_data = get_goodreads(title_data, author_data)
#cover from ereolen, mofibo, saxo
# danish library request
else:
title_data, author_data = get_goodreads(title_data, author_data)
return title_data, author_data
The views.py function for the front page is short and sweet:
def index(request):
context = {}
context['request'] = request
reads = Read.objects.order_by('-date__year', 'date__month','sort_order','id').select_related('title')
context['reads'] = reads
context['months'] = [[i, calendar.month_abbr[i]] for i in range(1,13)]
return render(request, 'books/index.html', context)
And, while longer, I think the template loop is nice too (although there is that clumsy nested loop):
{% regroup reads by date.year as years_list %}
{% for year, readings in years_list %}
<h2>{{ year }}</h2>
{% if year == 2015 %}
<p>I was on paternity leave most of this year which gave me time to read a lot, but not the mental surplus to register by month. This year I bought a Kindle which re-kindled (durr) my interest in reading.</p>
{% elif year == 2004 %}
<p>I was working in England from around September 2003 to February 2004. This gave me time to read a lot, but not the computer access at home necessary to register my reads precisely.</p>
{% elif year == 2003 %}
<p>The year I began registering my reads.</p>
{% elif year == 2002 %}
<p>This - and all years before - is from memory in 2003, so not really precise.</p>
{% endif %}
{% regroup readings by date.month as months_list %}
{% if year > 2004 and not year == 2015 %}
<div class="grid reads">
{% for month in months %}
<div class="flex">
<div>{{ month.1 }}</div>
{% for mon, reads in months_list %}
{% if mon == month.0 %}
{% for read in reads %}
<a title="{{ read.title }}" href="{% url 'books_book' read.title.id %}"><img class="frontcover" loading="lazy" src="{% static 'books/covers/150/' %}{{ read.title.id }}.webp"></a>
{% endfor %}
{% endif %}
{% endfor %}
</div>
{% endfor %}
</div>
{% else %}
{% for read in readings %}
<a href="{% url 'books_book' read.title.id %}"><img class="frontcover" loading="lazy" src="{% static 'books/covers/150/' %}{{ read.title.id }}.webp"></a>
{% endfor %}
{% endif %}
{% endfor %}
Here’s the views.py function which could probably be sped up if I had any idea how (which I don’t):
def statistics(request):
context = {}
# All reads, used for lots of charts
reads = Read.objects.order_by('date__year').select_related('title').prefetch_related('title__authors')
context['reads'] = reads
# Books per year chart queryset
books_pages_per_year = Read.objects.values('date__year').annotate(Count('id'), Sum('title__pages'), Avg('title__pages')).order_by('date__year')
context['books_pages_per_year'] = books_pages_per_year
# Prepare year, value-dictionaries
genre_structure = {} # fiction vs. non-fiction
author_gender_structure = {} # male vs. female
author_birth_structure = {} # median age of authors
read_language_structure = {} # language of read
original_language_structure = {} # original language of read
language_choices = dict(Title.LANGUAGE_CHOICES) # look up dict for original languages
author_country_structure = {} # country of author
country_choices = dict(Author.COUNTRY_CHOICES)
book_age_structure = {} # median age of books
for read in reads:
year_of_read = read.date.year
# Put year keys in dictionaries
if not year_of_read in genre_structure: # check one = check all
genre_structure[year_of_read] = []
author_gender_structure[year_of_read] = []
author_birth_structure[year_of_read] = []
read_language_structure[year_of_read] = []
original_language_structure[year_of_read] = []
author_country_structure[year_of_read] = []
book_age_structure[year_of_read] = []
# Put values in dictionaries
if read.title.read_language == 'da' or read.title.read_language == 'en':
read_language_structure[year_of_read].append(read.title.read_language)
if read.title.original_language:
original_language_structure[year_of_read].append(language_choices[read.title.original_language])
if read.title.genre:
genre_structure[year_of_read].append(read.title.genre)
if read.title.first_published:
book_age_structure[year_of_read].append(read.title.first_published.year)
for author in read.title.authors.all():
if author.gender:
author_gender_structure[year_of_read].append(author.gender)
if author.birth_date:
author_birth_structure[year_of_read].append(author.birth_date.year)
if author.country:
author_country_structure[year_of_read].append(country_choices[author.country])
# Prepare datasets for charts
genres = {}
for year, genre_list in genre_structure.items():
number_of_titles = len(genre_list)
number_of_fiction_titles = sum(1 for genre in genre_list if genre == 'fi')
fiction_percentage = int(number_of_fiction_titles/number_of_titles*100)
non_fiction_percentage = 100 - fiction_percentage
genres[year] = [fiction_percentage, non_fiction_percentage]
context['genres'] = genres
median_author_age = {}
for year, birthyears in author_birth_structure.items():
birthyears = sorted(birthyears)
median_birthyear = birthyears[len(birthyears) // 2]
median_author_age[year] = year - median_birthyear
context['median_author_age'] = median_author_age
author_genders = {}
for year, genders in author_gender_structure.items():
number_of_authors = len(genders)
males = sum(1 for gender in genders if gender == 'm')
male_percentage = int(males/number_of_authors*100)
female_percentage = 100 - male_percentage
author_genders[year] = [male_percentage, female_percentage]
context['author_genders'] = author_genders
read_languages = {}
for year, languages in read_language_structure.items():
number_of_languages = len(languages)
danish = sum(1 for language in languages if language == 'da')
danish_percentage = int(danish / number_of_languages * 100)
english_percentage = 100 - danish_percentage
read_languages[year] = [danish_percentage, english_percentage]
context['read_languages'] = read_languages
original_languages = []
original_languages_years = []
for year, languages in original_language_structure.items():
if not year in original_languages_years:
original_languages_years.append(year)
for lang in languages:
if lang not in original_languages:
original_languages.append(lang)
original_languages_template = {}
for language in original_languages:
original_languages_template[language] = []
for year in original_languages_years:
count_of_language_in_year = sum(1 for lang in original_language_structure[year] if language == lang)
original_languages_template[language].append(count_of_language_in_year)
context['original_languages_template'] = original_languages_template
context['original_languages_years'] = original_languages_years
author_countries = []
author_countries_years = []
for year, countries in author_country_structure.items():
if not year in author_countries_years:
author_countries_years.append(year)
for country in countries:
if country not in author_countries:
author_countries.append(country)
author_countries_template = {}
for country in author_countries:
author_countries_template[country] = []
for year in author_countries_years:
count_of_country_in_year = sum(1 for countr in author_country_structure[year] if country == countr)
author_countries_template[country].append(count_of_country_in_year)
context['author_countries_template'] = author_countries_template
context['author_countries_years'] = author_countries_years
median_book_age = {}
for year, publish_years in book_age_structure.items():
publish_years = sorted(publish_years)
# account for no data in years
if len(publish_years) >= 2:
median_publish_year = publish_years[len(publish_years) // 2]
elif len(publish_years) == 1:
median_publish_year = publish_years[0]
else:
median_publish_year = 0
median_book_age[year] = year - median_publish_year
context['median_book_age'] = median_book_age
return render(request, 'books/statistics.html', context)
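On the note above about this view probably being slow: one possible direction, as a sketch only and assuming the field names used elsewhere in this post, is to push some of the per-year counting into the database with Django's aggregation instead of looping over reads in Python. The author gender counts, for example:
from django.db.models import Count
# Sketch: count reads per year and author gender in the database.
# Assumes the Read -> Title -> Author relations used above.
gender_counts = (
    Read.objects
    .values('date__year', 'title__authors__gender')
    .annotate(total=Count('id'))
    .order_by('date__year')
)
# Each row is a dict like {'date__year': 2020, 'title__authors__gender': 'f', 'total': 12},
# which could then be turned into the percentage pairs the chart expects.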
And a template example:
<div>
<h2>Reads per year</h2>
<canvas id="books_per_year"></canvas>
</div>
<script>
var ctx = document.getElementById('books_per_year').getContext('2d');
var myChart = new Chart(ctx, {
type: 'bar',
data: {
labels: [{% for year in books_pages_per_year %}{% if not forloop.last %}{{ year.date__year }}, {% else %}{{ year.date__year }}{% endif %}{% endfor %}],
datasets: [{
label: 'Read',
data: [{% for year in books_pages_per_year %}{% if not forloop.last %}{{ year.id__count }}, {% else %}{{ year.id__count }}{% endif %}{% endfor %}],
backgroundColor: 'rgba(255, 99, 132, 0.2)',
borderColor: 'rgba(255, 99, 132, 1)',
borderWidth: 1
}]
},
options: {
tooltips: {
callbacks: {
label: function(tooltipItem, data) {
return data.datasets[tooltipItem.datasetIndex].label + ': ' + tooltipItem.value + ' books';
}
}
},
legend: {
display: false
},
responsive: true,
scales: {
yAxes: [{
ticks: {
beginAtZero: true
}
}]
}
}
});
</script>
Wallnot's Twitter bot finds shared articles from Politiken and Zetland on Twitter and shares them with the world. It works like this:
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import date
from datetime import timedelta
import json
import time
import random
from TwitterAPI import TwitterAPI
from nested_lookup import nested_lookup
# CONFIGURATION #
# List to store articles to post to Twitter
articlestopost = []
# Search tweets from last 3 hours
now = datetime.utcnow()
since_hours = 3
since = now - timedelta(hours=since_hours)
since_string = since.strftime("%Y-%m-%dT%H:%M:%SZ")
# Search configuration
# https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent
# https://github.com/twitterdev/Twitter-API-v2-sample-code/tree/master/Recent-Search
tweet_fields = "tweet.fields=entities"
media_fields = "media.fields=url"
max_results = "max_results=100"
start_time = "start_time=" + since_string
# Twitter API login
client_key = ''
client_secret = ''
access_token = ''
access_secret = ''
api = TwitterAPI(client_key, client_secret, access_token, access_secret)
bearer_token = ''
# POLITIKEN #
# Run search
query = 'politiken.dk/del'
url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
query, tweet_fields, media_fields, max_results, start_time
)
headers = {"Authorization": "Bearer {}".format(bearer_token)}
response = requests.request("GET", url, headers=headers)
json_response = response.json()
urllist = list(set(nested_lookup('expanded_url', json_response)))
# Only process urls that were not in our last Twitter query
proceslist = []
with open("./pol_lastbatch.json", "r", encoding="utf8") as fin:
lastbatch = list(json.load(fin))
for url in urllist:
if url not in lastbatch and query in url:
proceslist.append(url)
# Save current query to use for next time
with open("./pol_lastbatch.json", "wt", encoding="utf8") as fout:
lastbatch = json.dumps(urllist)
fout.write(lastbatch)
# Request articles and extract unique ids of paywalled articles
articlelist = []
pol_therewasanerror = False
for url in proceslist:
try:
if 'https://www.google.com' in url:
start = url.find('url=')+4
end = url.find('&', start)
url = url[start:end]
if not len(url) == 37:
url = url[:37]
data = requests.get(url)
result = data.text
if '"isAccessibleForFree": "True"' not in result:
realurl = data.history[0].headers['Location']
if not "/article" in realurl and not ".ece" in realurl:
start_of_unique_id = realurl.index("/art")+1
end_of_unique_id = realurl[start_of_unique_id:].index("/")
unique_id = realurl[start_of_unique_id:start_of_unique_id+end_of_unique_id]
elif "/article" in realurl and ".ece" in realurl:
start_of_unique_id = realurl.index("/article")+1
end_of_unique_id = realurl[start_of_unique_id:].index(".ece")
unique_id = realurl[start_of_unique_id:start_of_unique_id+end_of_unique_id]
articlelist.append({"id": unique_id, "url": url})
except Exception as e:
print(url)
print(e)
pol_therewasanerror = True
#If something fails, we'll process everything again next time
if pol_therewasanerror == True:
with open("./pol_lastbatch.json", "wt", encoding="utf8") as fout:
urllist = []
lastbatch = json.dumps(urllist)
fout.write(lastbatch)
# Check if article is already posted and update list of posted articles
with open("./pol_published_v2.json", "r", encoding="utf8") as fin:
alreadypublished = list(json.load(fin))
# File below used for paywall.py to update wallnot.dk
for article in articlelist:
hasbeenpublished = False
for published_article in alreadypublished:
if article['id'] == published_article['id']:
hasbeenpublished = True
break
if hasbeenpublished == False:
alreadypublished.append(article)
articlestopost.append(article)
# Save updated already published links
with open("./pol_published_v2.json", "wt", encoding="utf8") as fout:
alreadypublishedjson = json.dumps(alreadypublished)
fout.write(alreadypublishedjson)
# ZETLAND #
# Run search
query = 'zetland.dk/historie'
url = "https://api.twitter.com/2/tweets/search/recent?query={}&{}&{}&{}&{}".format(
query, tweet_fields, media_fields, max_results, start_time
)
headers = {"Authorization": "Bearer {}".format(bearer_token)}
response = requests.request("GET", url, headers=headers)
json_response = response.json()
urllist = list(set(nested_lookup('expanded_url', json_response)))
# Only process urls that were not in our last Twitter query
proceslist = []
with open("./zet_lastbatch.json", "r", encoding="utf8") as fin:
lastbatch = list(json.load(fin))
for url in urllist:
if url not in lastbatch and query in url:
proceslist.append(url)
# Save current query to use for next time
with open("./zet_lastbatch.json", "wt", encoding="utf8") as fout:
lastbatch = json.dumps(urllist)
fout.write(lastbatch)
# Request articles and get titles and dates and sort by dates
articlelist = []
titlecheck = []
zet_therewasanerror = False
for url in proceslist:
    try:
        if 'https://www.google.com' in url:
            start = url.find('url=')+4
            end = url.find('&', start)
            url = url[start:end]
        data = requests.get(url)
        result = data.text
        soup = BeautifulSoup(result, "lxml")
        title = soup.find('meta', attrs={'property':'og:title'})
        title = title['content']
        timestamp = soup.find('meta', attrs={'property':'article:published_time'})
        timestamp = timestamp['content']
        timestamp = timestamp[:timestamp.find("+")]
        dateofarticle = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%f')
        if title not in titlecheck:
            articlelist.append({"title": title, "url": url, "date": dateofarticle})
            titlecheck.append(title)
    except Exception as e:
        print(url)
        print(e)
        zet_therewasanerror = True
# If something fails, we'll process everything again next time
if zet_therewasanerror == True:
    with open("./zet_lastbatch.json", "wt", encoding="utf8") as fout:
        urllist = []
        lastbatch = json.dumps(urllist)
        fout.write(lastbatch)
articlelist_sorted = sorted(articlelist, key=lambda k: k['date'])
# Check if article is already posted and update list of posted articles
with open("./zet_published.json", "r", encoding="utf8") as fin:
    alreadypublished = list(json.load(fin))
for art in articlelist_sorted:
    title = art['title']
    if title not in alreadypublished:
        alreadypublished.append(title)
        articlestopost.append(art)
# Save updated already published links
with open("./zet_published.json", "wt", encoding="utf8") as fout:
    alreadypublishedjson = json.dumps(alreadypublished, ensure_ascii=False)
    fout.write(alreadypublishedjson)
# POST TO TWITTER AND FACEBOOK #
friendlyterms = ["flink","rar","gavmild","velinformeret","intelligent","sød","afholdt","bedårende","betagende","folkekær","godhjertet","henrivende","smagfuld","tækkelig","hjertensgod","graciøs","galant","tiltalende","prægtig","kær","godartet","human","indtagende","fortryllende","nydelig","venlig","udsøgt","klog","kompetent","dygtig","ejegod","afholdt","omsorgsfuld","elskværdig","prægtig","skattet","feteret"]
enjoyterms = ["God fornøjelse!", "Nyd den!", "Enjoy!", "God læsning!", "Interessant!", "Spændende!", "Vidunderligt!", "Fantastisk!", "Velsignet!", "Glæd dig!", "Læs den!", "Godt arbejde!", "Wauv!"]
if articlestopost:
    for art in articlestopost:
        if "zetland" in art['url']:
            medium = "@ZetlandMagasin"
        else:
            medium = "@politiken"
        friendlyterm = random.choice(friendlyterms)
        enjoyterm = random.choice(enjoyterms)
        status = "En " + friendlyterm + " abonnent på " + medium + " har delt en artikel. " + enjoyterm
        twitterstatus = status + " " + art['url']
        try:
            twitterupdate = api.request('statuses/update', {'status': twitterstatus})
        except Exception as e:
            print(e)
        time.sleep(15)
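One practical note: the script opens its JSON state files in read mode, so they have to exist before the first run. A minimal bootstrap could look like this (my own sketch; I assume an empty list is a valid starting state for each file, which matches how the error handling resets them):
# bootstrap_state.py - one-off helper to create the state files used above
# Assumption: an empty list is a valid initial state for each file
import json
import os
for path in ("./pol_lastbatch.json", "./pol_published_v2.json",
             "./zet_lastbatch.json", "./zet_published.json"):
    if not os.path.isfile(path):
        with open(path, "wt", encoding="utf8") as fout:
            json.dump([], fout)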
After downloading a lot of beautiful photographs from the internet, I needed to do some rough sorting. I wanted to delete the photos whose resolution was too low for me to enjoy looking at them, or perhaps print them at some later point.
First I use the "walk" functionality from the os library, which lets me loop through every folder, subfolder and file from a starting point on my hard drive.
Then I use Pillow to get the width and height of each photo and calculate its area. If a photo is 3 megapixels (3 million pixels) or larger, I keep it. If it is smaller, I delete it.
Here you can see how I did it:
# megapixels.py
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
'''A program to go through a directory and subdirectories and delete
image files below a certain megapixel size.'''
import os # Used to walk through directories and delete files
from PIL import Image
import PIL
save_location = "C:/Downloads/"
contents = os.walk(save_location)
for root, directories, files in contents:
    for file in files:
        location = os.path.join(root,file)
        if not ".py" in file:
            try:
                image = Image.open(location)
                area = image.size[0]*image.size[1]
                if area >= 3000000:
                    print("stort", location) # "stort" = large enough, keep
                    image.close()
                else:
                    print("for lille", location) # "for lille" = too small, delete
                    image.close()
                    os.remove(location)
            except PIL.UnidentifiedImageError:
                # Not something Pillow can read; delete it if it has an image extension
                if ".jpg" in file or ".png" in file or ".jpeg" in file or ".tif" in file:
                    print("deleting:", location)
                    os.remove(location)
            except PIL.Image.DecompressionBombError:
                pass
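For a sense of scale, the 3-megapixel cutoff corresponds to an image of roughly 2048 by 1536 pixels. A quick arithmetic check (my own illustration, separate from the script above):
# megapixel_check.py - illustration of the 3,000,000 pixel threshold used above
for width, height in ((2048, 1536), (1280, 960)):
    area = width * height
    print(width, "x", height, "=", area, "pixels ->", "keep" if area >= 3000000 else "delete")
# 2048 x 1536 = 3145728 pixels -> keep
# 1280 x 960 = 1228800 pixels -> delete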
If you have spent time on the internet, you have probably come across something like this at some point:
Many web administrators choose to hide these listings of files on a web server, which the Apache web server software can generate automatically.
But I discovered by chance that I could see what the photo agency Magnum had uploaded to their WordPress installation.
I decided to try to make a local copy, so I could look at beautiful photographs without having to wait for downloads from the internet.
I first tried Wget, a small program designed to mirror websites locally. But Wget had trouble fetching and chewing through the long lists of files. One of them was 36 megabytes, for example. That is a lot of links.
So I wrote a small Python program that can chew through this kind of directory and file listing and download the files locally.
Here it is:
# apache-directory-downloader.py
# Author: Morten Helmstedt. E-mail: helmstedt@gmail.com
'''A program to fetch files from standard apache directory listings on the internet.
See https://duckduckgo.com/?t=ffab&q=apache%2Bdirectory%2Blisting&ia=images&iax=images
for examples of what this is.'''
import requests # Send http requests and receive responses
from bs4 import BeautifulSoup # Parse HTML data structures, e.g. to search for links
import os # Used to create directories at local destination
import shutil # Used to copy binary files from http response to local destination
import re # Regex parser and search functions

# Terms to exclude, files with these strings in them are not downloaded
exclude = [
    "-medium",
    "-overlay",
    "-teaser-",
    "-overlay",
    "-thumbnail",
    "-collaboration",
    "-scaled",
    "-photographer-featured",
    "-photographer-listing",
    "-full-on-mobile",
    "-theme-small-teaser",
    "-post",
    "-large",
    "-breaker",
]

# Takes an url and collects all links
def request(url, save_location):
    # Print status to let user know that something is going on
    print("Requesting:", url)
    # Fetch url
    response = requests.get(url)
    # Parse response
    soup = BeautifulSoup(response.text, "lxml")
    # Search for all links and exclude certain strings and patterns from links
    urllist = [a['href'] for a in soup.find_all('a', href=True) if not '?C=' in a['href'] and not a['href'][0] == "/" and not any(term in a['href'] for term in exclude) and not re.search(r"\d\d[x]\d\d", a['href'])]
    # If status code is not 200 (OK), add url to list of errors
    if not response.status_code == 200:
        errorlist.append(url)
    # Send current url, list of links and current local save location to scrape function
    return scrape(url, urllist, save_location)

def scrape(path, content, save_location):
    # Loop through all links
    for url in content:
        # Print status to let user know that something is going on
        print("Parsing/downloading:", path+url)
        # If there's a slash ("/") in the link, it is a directory
        if "/" in url:
            # Create local directory if it doesn't exist
            try:
                os.mkdir(save_location+url)
            except:
                pass
            # Run request function to fetch contents of directory
            request(path+url, save_location+url)
        # If the link doesn't contain a slash, it's a file and is saved
        else:
            # Check if file already exists, e.g. has been downloaded in a prior run
            if not os.path.isfile(save_location+url):
                # If file doesn't exist, fetch it from remote location
                file = requests.get(path+url, stream=True)
                # Print status to let user know that something is going on
                print("Saving file:", save_location+url)
                # Save file to local destination
                with open(save_location+url, 'wb') as f:
                    # Decodes file if received compressed from server
                    file.raw.decode_content = True
                    # Copies binary file to local destination
                    shutil.copyfileobj(file.raw, f)

# List to collect crawling errors
errorlist = []
# Local destination, e.g. 'C:/Downloads/' on Windows
save_location = "C:/Downloads/"
# Remote location, e.g. https://example.com/files
url = "https://content.magnumphotos.com/wp-content/uploads/"
# Call function to start crawling
request(url, save_location)
# Print any crawling errors
print(errorlist)
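If errorlist is not empty at the end, the directories that failed can simply be fed through request() again. Here is a minimal retry sketch (my own addition, reusing the request() function and the errorlist, save_location and url variables defined above; the local-path mapping is my assumption):
# retry_errors.py - hedged sketch: re-crawl directories that failed on the first pass
# Assumes it runs in the same module as the crawler above, after request(url, save_location)
retries = list(errorlist)
errorlist = []  # reset so this pass collects its own failures
for failed_url in retries:
    # Map the remote directory back to its local folder below save_location
    request(failed_url, save_location + failed_url[len(url):])
print("Still failing after retry:", errorlist)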
On kukua.dk, companies in the cultural sector can get in touch with students at the Department of Arts and Cultural Studies at the University of Copenhagen. Think: bulletin board.
It is free for the poor (but creative) companies, which for instance get skilled interns without paying a penny for them.
The companies that have the skills to do so register themselves and post their own listings.
Chapter 2: I get an email
On 15 April 2019 I receive this from a student assistant at the company Copyright Agent:
Dear Kukua.dk We have become aware that you have most likely infringed copyright, as we cannot find any basis for the use in our systems.
As a photo agency, Ritzau Scanpix owns the resale rights to the image in question, which is marked with a red box in the attached document, which also contains documentation of the infringement we believe has taken place.
On that basis, Ritzau Scanpix is obliged, on behalf of its photographers, to seek remuneration and compensation for images that have been published without authorisation. Even if it may not have been your intention, publishing the image without a valid licence or permission is an infringement of the photographer's copyright.
The attached material contains general information, documentation, an invoice and a statement of the compensation due to the rights holder, as well as "Frequently asked questions and answers".
As Ritzau Scanpix is experiencing a growing number of copyright infringements of their material, they find it necessary to locate and police their material, so that they can continue to deliver quality material to their customers in the future.
Copyright Agent works with a number of professional photographers and leading photo agencies on protecting their copyright on the internet. You can read more about Copyright Agent here: www.copyrightagent.dk
If you have questions or documentation regarding the case, you are very welcome to reply to this email or contact us by phone on 70 273 272, Monday to Friday from 9:00 to 17:00.
Please state your case number if you contact us by phone, so that we are able to assist you with the specific case.
Attached to the email is a PDF file telling me that I have violated copyright law, along with an invoice for DKK 3,437.50 that I must pay "within 10 days from today's date".
Here you can see the PDF file; I have only censored the image that Copyright Agent inserted to document my alleged copyright infringement:
So my immediate thought is: this doesn't really have anything to do with me. Just as politiken.dk is not liable if I paste the entire text of Syv år for PET into a comment on an article (as long as they remove it again once they become aware of the copyright infringement), I am not liable when I assume, in good faith, that my users of course have permission to publish the photos they publish. After all, it is their own creative industry that lives off copyright.
So I promptly reply:
Dear Fatima
The image in question was uploaded by a user of the site. Anyone can register on the site and create posts.
As far as I can tell, the person in question is employed at, or affiliated with, Posthus Teatret. Her phone number is in the post, so I suggest you call her and ask. I will gladly remove the post and/or the photo from the site if I receive a sworn statement from you that you own the copyright to the photo.
Regards, Morten
Fatima replies:
Dear Morten I am attaching documentation that Ritzau Scanpix holds the copyright to the image material. We will contact Posthus Teatret. Thank you for your help.
I delete the image from kukua.dk and write:
Dear Fatima
I have deleted the image from the server.
And Fatima replies:
Dear Morten
We have noted that the image has been removed, for which we thank you.
And I think all is well. But it is not…
Chapter 4: The reminder
On 14 May 2019 I receive a new email from Fatima:
As the payment deadline has been exceeded, we are sending a small reminder. If we do not hear back within the week, we will send reminders in the cases for the original amounts.
My reply: We have sorted out the matter, and I therefore fully expect you to drop the claim.
And I think all is well. But it is not…
Chapter 5: Notice of debt collection
On 12 June I receive this email from, you guessed it, Fatima:
R2, copyright infringement: notice of debt collection
Dear Kukua.dk
We have previously sent a claim for compensation for the infringement of our client's copyright. We have still not registered your payment and hereby send the attached reminder in the case.
Copyright Agent works with a number of professional photographers and leading photo agencies on protecting their copyright on the internet.
You can read more about Copyright Agent here: www.copyrightagent.dk
If you have questions or documentation regarding the case, you are very welcome to reply to this email or contact us by phone.
I reply: Please give me a call at your convenience on 25 80 16 54. We have already closed this case, yet you keep contacting me.
Incidentally, I have also answered all of your previous emails.
And just to say it very clearly: I do not intend to pay for a possible copyright infringement that I did not commit.
Dear Fatima
Attached is documentation of who, if a copyright infringement has indeed taken place, committed it by uploading the image in question to the server hosting kukua.dk. You can direct any claims to that person.
Fatima replies: I apologise for the reminder you received, which was sent by mistake. We will take the matter up with Posthus Teatret.
I hope things ended well for Posthus Teatret. I still do not know whether they were allowed to use the photo of their cinema, but the fact that Copyright Agent goes after a poor cultural institution in order (by its own account) to help poor creatives clearly shows that Copyright Agent only does its error-ridden, incompetent work for the money.
Chapter 6: Why share the story?
So why publish my round in the ring with Copyright Agent?
So that others, as in the similar story about the law firm Njord, which sent unjustified invoices for downloaded films to just about anyone, can read about Copyright Agent's business model and methods, as a cautionary tale and perhaps a help, should they be so unlucky as to receive an email from the company.