From 80c08282e91389e1d5453c730e470332cdcf3a70 Mon Sep 17 00:00:00 2001
From: feder
Date: Mon, 20 Feb 2017 17:51:46 +0300
Subject: [PATCH] Using grab module for parsing

---
 hptoad.py        | 50 ++++++++++++++++--------------------------------
 requirements.txt |  3 +--
 2 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/hptoad.py b/hptoad.py
index 65eb0cc..d5e71fe 100755
--- a/hptoad.py
+++ b/hptoad.py
@@ -9,15 +9,7 @@ import subprocess
 import sys
 import time
 import sleekxmpp
-
-# --- page's head title parser
-import requests
-from lxml.html import fromstring
-from lxml.html import etree
-from bs4 import UnicodeDammit
-import chardet
-# --- page's head title parser
-
+from grab import Grab # page's head title parser
 
 opts = {
     "muc": "room@conference.example.com",
@@ -186,34 +178,26 @@ class Hptoad:
         reply = ""
         err = None
 
-# --- page's head title parser
+        # --- page's head title parser
         if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
             links = re.findall(r'(http[s]?://\S*)',body)
             if links:
                 for link in links:
-                    link=link.replace('>','')
-# http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml
-                    page = requests.get(link)
-                    ud = UnicodeDammit(page.content, is_html=True)
-                    enc = ud.original_encoding.lower()
-                    declared_enc = ud.declared_html_encoding
-                    if declared_enc:
-                        declared_enc = declared_enc.lower()
-                    if (declared_enc and enc != declared_enc):
-                        detect_dict = chardet.detect(r.content)
-                        det_conf = detect_dict["confidence"]
-                        det_enc = detect_dict["encoding"].lower()
-                        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
-                            enc = declared_enc
-                    content = page.content.decode(enc, "ignore").encode(enc)
-                    htmlparser = etree.HTMLParser(encoding=enc)
-                    root = etree.HTML(content, parser=htmlparser)
-                    title = root.findtext('.//title')
-                    if (len(links) > 1):
-                        reply = reply + "\nLink: %s" % title
-                    else:
-                        reply = reply + "Link: %s" % title
-# --- page's head title parser
+                    link = link.replace('>','')
+
+                    try:
+                        g = Grab()
+                        g.go(link)
+                        title = g.xpath_text('//title')
+                    except Exception:
+                        title = ""
+
+                    if (title):
+                        if (len(links) > 1):
+                            reply = reply + "\nLink: %s" % title
+                        else:
+                            reply = reply + "Link: %s" % title
+        # --- page's head title parser
 
         # Has to be redone with the current bot nick.
         call_regexp = re.compile("^%s[:,]" % self.bot_nick)
diff --git a/requirements.txt b/requirements.txt
index c8f5a3f..528a479 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
 sleekxmpp>=1.2.0
-bs4
-lsxm
+grab