utils.py 834 Bytes
Newer Older
cynddl's avatar
cynddl committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
from url import URL

# Query-parameter names that cleanup_url() passes to ``deparam`` for removal.
# Kept as a list, in the original order.
URL_TRACKERS = [
    # UTM campaign parameters
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_name",

    # FB parameters
    "fb",
    "fb_",
    "action_type_map",
    "action_ref_map",
    "action_object_map",
    # Session identifier (grouped with the FB entries in the original list)
    "PHPSESSID",

    # Miscellaneous tracking parameters
    "xtor",
    "link_time",
    "__scoop",
    # presumably HubSpot parameters -- TODO confirm
    "_hs",
    "_hsenc",
    "_hsmi",
    "hsCtaTracking",
    "wkey",
    "wemail",
]

def cleanup_url(url_path, default_scheme="http"):
    """
    Normalize a URL string and remove the tracker parameters in ``URL_TRACKERS``.

    The string is parsed with ``URL.parse`` and the parsed object is then run,
    in order, through its ``defrag``, ``deparam(URL_TRACKERS)``, ``canonical``,
    ``unpunycode`` and ``strip`` steps.  If the parsed URL's scheme is the
    empty string, ``default_scheme`` is assigned to it.

    Parameters
    ----------
    url_path
        The URL to clean; handed unchanged to ``URL.parse``.
    default_scheme
        Scheme assigned when the parsed URL's scheme is the empty string.

    Returns
    -------
    The ``utf8`` attribute of the cleaned URL object (shown as bytes in the
    examples below).

    Notes
    -----
    NOTE(review): the first example's input is written ``http//`` without a
    colon -- confirm this is intentional and that the expected output holds.

    Examples
    --------

    >>> cleanup_url("http//example.com/?utm_source=dlvr.it&utm_medium=twitter&utm_campaign=social")
    b'http//example.com/'

    >>> cleanup_url("ü.com/??")
    b'http://\xc3\xbc.com/'
    """
    parsed = URL.parse(url_path)

    # Each step is invoked on the result of the previous one, exactly like a
    # chained call.  The scheme check and the return value below use
    # ``parsed`` itself, so this relies on the steps updating it in place --
    # presumably every step returns the same object; verify against the
    # ``url`` library.
    step = parsed.defrag()
    step = step.deparam(URL_TRACKERS)
    step = step.canonical()
    step = step.unpunycode()
    step.strip()

    # Only an empty-string scheme triggers the fallback.
    if parsed.scheme == "":
        parsed.scheme = default_scheme

    return parsed.utf8