Add: page's head title parser

2017-02-19 14:16:40 +03:00
parent 9f2ea9cc1d
commit 31b1dc364f
2 changed files with 40 additions and 0 deletions
@@ -10,6 +10,15 @@ import sys
 import time
 import sleekxmpp

+# --- page's head title parser
+import requests
+from lxml.html import fromstring
+from lxml.html import etree
+from bs4 import UnicodeDammit
+import chardet
+# --- page's head title parser
+
+
 opts = {
  "muc":      "room@conference.example.com",
  "nick":     "botname",
@@ -177,6 +186,35 @@ class Hptoad:
        reply = ""
        err = None

+# --- page's head title parser
+        if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
+            links = re.findall(r'(http[s]?://\S*)',body)
+            if links:
+                for link in links:
+                    link=link.replace('>','')
+# http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml
+                    page = requests.get(link)
+                    ud = UnicodeDammit(page.content, is_html=True)
+                    enc = ud.original_encoding.lower()
+                    declared_enc = ud.declared_html_encoding
+                    if declared_enc:
+                        declared_enc = declared_enc.lower()
+                    if (declared_enc and enc != declared_enc):
+                        detect_dict = chardet.detect(r.content)
+                        det_conf = detect_dict["confidence"]
+                        det_enc = detect_dict["encoding"].lower()
+                        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
+                            enc = declared_enc
+                    content = page.content.decode(enc, "ignore").encode(enc)
+                    htmlparser = etree.HTMLParser(encoding=enc)
+                    root = etree.HTML(content, parser=htmlparser)
+                    title = root.findtext('.//title')
+                    if (len(links) > 1):
+                        reply = reply + "\nLink: %s" % title
+                    else:
+                        reply = reply + "Link: %s" % title
+# --- page's head title parser
+
        # Has to be redone with the current bot nick.
        call_regexp = re.compile("^%s[:,]" % self.bot_nick)

@@ -1 +1,3 @@
 sleekxmpp>=1.2.0
+bs4
+lxml