From 31b1dc364fd55025597b5b513a5b22081a702f23 Mon Sep 17 00:00:00 2001 From: feder Date: Sun, 19 Feb 2017 14:16:40 +0300 Subject: [PATCH] Add: page's head title parser --- hptoad.py | 38 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 ++ 2 files changed, 40 insertions(+) diff --git a/hptoad.py b/hptoad.py index 1c8b337..65eb0cc 100755 --- a/hptoad.py +++ b/hptoad.py @@ -10,6 +10,15 @@ import sys import time import sleekxmpp +# --- page's head title parser +import requests +from lxml.html import fromstring +from lxml.html import etree +from bs4 import UnicodeDammit +import chardet +# --- page's head title parser + + opts = { "muc": "room@conference.example.com", "nick": "botname", @@ -177,6 +186,35 @@ class Hptoad: reply = "" err = None +# --- page's head title parser + if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)): + links = re.findall(r'(http[s]?://\S*)',body) + if links: + for link in links: + link=link.replace('>','') +# http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml + page = requests.get(link) + ud = UnicodeDammit(page.content, is_html=True) + enc = ud.original_encoding.lower() + declared_enc = ud.declared_html_encoding + if declared_enc: + declared_enc = declared_enc.lower() + if (declared_enc and enc != declared_enc): + detect_dict = chardet.detect(r.content) + det_conf = detect_dict["confidence"] + det_enc = detect_dict["encoding"].lower() + if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT: + enc = declared_enc + content = page.content.decode(enc, "ignore").encode(enc) + htmlparser = etree.HTMLParser(encoding=enc) + root = etree.HTML(content, parser=htmlparser) + title = root.findtext('.//title') + if (len(links) > 1): + reply = reply + "\nLink: %s" % title + else: + reply = reply + "Link: %s" % title +# --- page's head title parser + # Has to be redone with the current bot nick. 
call_regexp = re.compile("^%s[:,]" % self.bot_nick) diff --git a/requirements.txt b/requirements.txt index 86b3814..c8f5a3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ sleekxmpp>=1.2.0 +bs4 +lxml