From 80c08282e91389e1d5453c730e470332cdcf3a70 Mon Sep 17 00:00:00 2001
From: feder
Date: Mon, 20 Feb 2017 17:51:46 +0300
Subject: [PATCH] Using grab module for parsing

---
 hptoad.py        | 50 ++++++++++++++++--------------------------------
 requirements.txt |  3 +--
 2 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/hptoad.py b/hptoad.py
index 65eb0cc..d5e71fe 100755
--- a/hptoad.py
+++ b/hptoad.py
@@ -9,15 +9,7 @@ import subprocess
 import sys
 import time
 import sleekxmpp
-
-# --- page's head title parser
-import requests
-from lxml.html import fromstring
-from lxml.html import etree
-from bs4 import UnicodeDammit
-import chardet
-# --- page's head title parser
-
+from grab import Grab # page's head title parser
 
 opts = {
     "muc": "room@conference.example.com",
@@ -186,34 +178,26 @@ class Hptoad:
         reply = ""
         err = None
 
-# --- page's head title parser
+        # --- page's head title parser
         if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
             links = re.findall(r'(http[s]?://\S*)',body)
             if links:
                 for link in links:
-                    link=link.replace('>','')
-# http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml
-                    page = requests.get(link)
-                    ud = UnicodeDammit(page.content, is_html=True)
-                    enc = ud.original_encoding.lower()
-                    declared_enc = ud.declared_html_encoding
-                    if declared_enc:
-                        declared_enc = declared_enc.lower()
-                    if (declared_enc and enc != declared_enc):
-                        detect_dict = chardet.detect(r.content)
-                        det_conf = detect_dict["confidence"]
-                        det_enc = detect_dict["encoding"].lower()
-                        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
-                            enc = declared_enc
-                    content = page.content.decode(enc, "ignore").encode(enc)
-                    htmlparser = etree.HTMLParser(encoding=enc)
-                    root = etree.HTML(content, parser=htmlparser)
-                    title = root.findtext('.//title')
-                    if (len(links) > 1):
-                        reply = reply + "\nLink: %s" % title
-                    else:
-                        reply = reply + "Link: %s" % title
-# --- page's head title parser
+                    link = link.replace('>','')
+
+                    try:
+                        g = Grab()
+                        g.go(link)
+                        title = g.xpath_text('//title')
+                    except Exception:
+                        title = ""
+
+                    if (title):
+                        if (len(links) > 1):
+                            reply = reply + "\nLink: %s" % title
+                        else:
+                            reply = reply + "Link: %s" % title
+        # --- page's head title parser
 
         # Has to be redone with the current bot nick.
         call_regexp = re.compile("^%s[:,]" % self.bot_nick)
diff --git a/requirements.txt b/requirements.txt
index c8f5a3f..528a479 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
 sleekxmpp>=1.2.0
-bs4
-lsxm
+grab