diff --git a/wantzel.py b/wantzel.py index 73f0c54be26c205925daf41a43a263437be3cbca..3e840df2fe13172268aa8af449bb6c6552ec262f 100644 --- a/wantzel.py +++ b/wantzel.py @@ -34,19 +34,15 @@ def get_cursor(): return db.cursor() return None -def get_url(message, command=""): +def get_url(message): """ - Retrieve the url behind the command. + Retrieve the url in the message. """ - # Let's get what is behind the command - result = re.search("!%s ([^ ]*)" % command, message) - if not result: - return "" - url = result.group(1) - # Verify the presence of http - result = re.search("^(https?://)(.+)$", url) + # Let's get the url + result = re.search("(https?[^ ]+)", message) if not result: return "http" + url = result.group(1) # Removing anchor if needed result = re.search("^([^#]*)", url) if result: @@ -55,6 +51,25 @@ def get_url(message, command=""): url = re.sub("[?&](utm_medium|utm_source|utm_campaign|xtor)=[^&]*", "", url) return url +def get_title(message): + title = "" + website = "" + try: + url = get_url(message)#re.search("(http[^ ]*)", msg).group(1) + website = re.search("//([^/]*)", url).group(1) + f = urllib.URLopener().open(url) + content = f.read() + title = re.search("([^<]+)", content).group(1) + except: + pass + # Unescaping HTML entities + if title: + title = re.sub(">", ">", title) + title = re.sub("<", "<", title) + title = re.sub(""", '"', title) + title = re.sub("&", "&", title) + return (title, website) + class Wantzel(object): """ Wantzel bot. @@ -98,6 +113,10 @@ class Wantzel(object): # Cleaning user name user = re.search("([^!]*)!", user).group(1) print("Message received: %s %s %s" % (user, channel, msg)) + # Whatever is done, get the title of an existing url in a message + title = "" + if "http" in msg: + title, website = get_title(msg) # Never answer to botself if user!=config.nickname: # If it's a query, bot should answer to the user as the channel @@ -112,28 +131,15 @@ class Wantzel(object): command = command.group(1) print("Command: %s" % command) if command.startswith("rp"): - self.rp(command, user, channel, msg) + self.rp(command, user, channel, msg, title) elif command=="help": self.help(user, channel, msg) elif command=="kill": self.kill(user, channel, msg) elif command=="stats": self.stats(user, channel, msg) - # Whatever is done, get the title of an existing url in a message - if "http" in msg: - content = "" - title = "" - website = "" - try: - url = re.search("(http[^ ]*)", msg).group(1) - website = re.search("//([^/]*)", url).group(1) - f = urllib.URLopener().open(url) - content = f.read() - title = re.search("([^<]+)", content).group(1) - except: - pass - if title and website: - self.send_message(channel, messages["title"] % (title, website)) + if title and website: + self.send_message(channel, messages["title"] % (title, website)) def on_joined(self, channel): """ @@ -161,14 +167,14 @@ class Wantzel(object): else: self.send_message(channel, messages["help"]) - def rp(self, command, user, channel, msg): + def rp(self, command, user, channel, msg, title=""): """ Adding the article in rp database. """ print("rp command %s" % command) cite = 0 note = 0 - url = get_url(msg, command) + url = get_url(msg) print("url: %s" % url) if url=="": return @@ -194,8 +200,11 @@ class Wantzel(object): # lang, published, nid, screenshot, title, fetched, seemscite print("Adding an article by %s: %s" % (user, url)) result = cursor.execute( - "INSERT INTO presse SET url=%s, provenance=%s, cite=%s, note=%s, datec=NOW()", - (url, user, cite, note) + """INSERT INTO presse SET + url=%s, provenance=%s, cite=%s, note=%s, datec=NOW(), title=%s, + lang='', published=0, nid=0, screenshot=0, fetched=0, seemscite=0 + """, + (url, user, cite, note, title) ) self.send_message(channel, messages["rp_new_article"] % user) else: @@ -216,7 +225,7 @@ class Wantzel(object): """ #TODO: Gérer les droits de cette commande print("kill command") - url = get_url(msg, "kill") + url = get_url(msg) print("url: %s" % url) if url=="": return