From 4ac6c9e919350400da0a7e39c85fd0af6b9ed625 Mon Sep 17 00:00:00 2001 From: Guy Halse Date: Fri, 21 Jun 2024 17:18:38 +0200 Subject: [PATCH] Fix attribute encoding when using Shibboleth RFC2616 states that HTTP headers are encoded in latin1 (iso-8859-1), and the Python/Django request.META (correctly) assumes that incoming headers will be encoded in this way. However, by default, Shibboleth ignores the iso-8859-1 restriction and puts the UTF-8 encoded values from SAML into its request headers with ShibUseHeaders without transliteration [ref](https://shibboleth.atlassian.net/wiki/spaces/SP3/pages/2065334723/ContentSettings). This results in incorrectly encoded characters when non-ASCII / accented characters are used in e.g. the first or last name. There are two ways we could fix this. The approach used here is to simply acknowledge the incorrect encoding and fix it (i.e. force the string to be interpreted as UTF-8 rather than Latin1). This is backwards compatible and will be invisible to any sites that don't already have incorrectly encoded names. The alternative would be to make use of Shibboleth's `ShibRequestSetting encoding URL` option in the Apache config to force Shibboleth to URL encode the string. We would then have to decode it when we consumed it. This approach is arguably more correct since the headers would be RFC compliant, but involves much more work and requires users to change their webserver config. It's not backwards compatible. --- edumanage/views.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/edumanage/views.py b/edumanage/views.py index 8fc9af75..b008a6a8 100644 --- a/edumanage/views.py +++ b/edumanage/views.py @@ -2778,7 +2778,11 @@ def lookupShibAttr(attrmap, requestMeta): for attr in attrmap: if (attr in requestMeta.keys()): if len(requestMeta[attr]) > 0: - return requestMeta[attr] + # HTTP headers are encoded in latin1 (RFC2616). 
However, by default + # Shibboleth SP ignores this and puts UTF-8 encoded values into its + # request headers with ShibUseHeaders. So we need to fix up the + # resulting misencoding of accented characters + return bytearray(requestMeta[attr], "iso-8859-1").decode("utf-8") return '' # def get_i18n_name(i18n_name, lang, default_lang='en', default_name='unknown'):