Using grab module for parsing

2017-02-20 17:51:46 +03:00 · 2017-02-20 17:51:46 +03:00 · 80c08282e9
parent 31b1dc364f
commit 80c08282e9
2 changed files with 18 additions and 35 deletions
--- a/hptoad.py
+++ b/hptoad.py
@@ -9,15 +9,7 @@ import subprocess
 import sys
 import time
 import sleekxmpp
-
+from grab import Grab # page's head title parser
 # --- page's head title parser
 import requests
 from lxml.html import fromstring
 from lxml.html import etree
 from bs4 import UnicodeDammit
 import chardet
 # --- page's head title parser
 opts = {
  "muc":      "room@conference.example.com",
@@ -186,34 +178,26 @@ class Hptoad:
        reply = ""
        err = None
-# --- page's head title parser
+        # --- page's head title parser
        if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
            links = re.findall(r'(http[s]?://\S*)',body)
            if links:
                for link in links:
-                    link=link.replace('>','')
+                    link = link.replace('>','')
-# http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml
+
-                    page = requests.get(link)
+                    try:
-                    ud = UnicodeDammit(page.content, is_html=True)
+                        g = Grab()
-                    enc = ud.original_encoding.lower()
+                        g.go(link)
-                    declared_enc = ud.declared_html_encoding
+                        title = g.xpath_text('//title')
-                    if declared_enc:
+                    except:
-                        declared_enc = declared_enc.lower()
+                        title = ""
-                    if (declared_enc and enc != declared_enc):
+
-                        detect_dict = chardet.detect(r.content)
+                    if (title):
                        det_conf = detect_dict["confidence"]
                        det_enc = detect_dict["encoding"].lower()
                        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
                            enc = declared_enc
                    content = page.content.decode(enc, "ignore").encode(enc)
                    htmlparser = etree.HTMLParser(encoding=enc)
                    root = etree.HTML(content, parser=htmlparser)
                    title = root.findtext('.//title')
                        if (len(links) > 1):
                            reply = reply + "\nLink: %s" % title
                        else:
                            reply = reply + "Link: %s" % title
-# --- page's head title parser
+        # --- page's head title parser
        # Has to be redone with the current bot nick.
        call_regexp = re.compile("^%s[:,]" % self.bot_nick)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
 sleekxmpp>=1.2.0
-bs4
+grab
 lsxm