Commit b600284b authored by cynddl

URL cleaning for major tracking codes

parent 872a80d1
from url import URL
# Query-string parameter names handed to ``URL.deparam`` by ``cleanup_url``.
# NOTE(review): entries such as "fb" / "fb_" look like prefixes; whether
# ``deparam`` matches exact names or prefixes is not visible here -- confirm.
URL_TRACKERS = [
    # UTM campaign parameters
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_name",
    # FB (Facebook)
    "fb",
    "fb_",
    "action_type_map",
    "action_ref_map",
    "action_object_map",
    # Session identifier (presumably PHP's default session parameter name)
    "PHPSESSID",
    # Miscellaneous trackers (origin not documented in this file)
    "xtor",
    "link_time",
    "__scoop",
    # "_hs*" / "hs*" family (presumably HubSpot -- TODO confirm)
    "_hs",
    "_hsenc",
    "_hsmi",
    "hsCtaTracking",
    # "w*" keys (origin not documented in this file)
    "wkey",
    "wemail",
]
def cleanup_url(url_path, default_scheme="http"):
    """
    Sanitize a URL string and remove major tracker codes.

    The string is parsed with ``URL.parse`` and then passed, in order,
    through ``defrag``, ``deparam(URL_TRACKERS)``, ``canonical``,
    ``unpunycode`` and ``strip``. If the parsed URL has an empty scheme,
    ``default_scheme`` is assigned to it.

    Parameters
    ----------
    url_path
        The URL text to clean.
    default_scheme
        Scheme assigned when the parsed URL's scheme is the empty string.

    Returns
    -------
    The ``utf8`` attribute of the processed URL object (the examples below
    show a bytes value).

    Examples
    --------
    >>> cleanup_url("http//example.com/?utm_source=dlvr.it&utm_medium=twitter&utm_campaign=social")
    b'http//example.com/'
    >>> cleanup_url("ü.com/??")
    b'http://\xc3\xbc.com/'
    """
    parsed = URL.parse(url_path)

    # Thread each call's return value into the next call, exactly as the
    # fluent chain would; the final result is discarded and ``parsed`` is
    # what gets inspected and returned below.
    stage = parsed.defrag()
    stage = stage.deparam(URL_TRACKERS)
    stage = stage.canonical()
    stage = stage.unpunycode()
    stage.strip()

    if parsed.scheme == "":
        parsed.scheme = default_scheme

    return parsed.utf8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment