Using grab module for parsing
This commit is contained in:
parent
31b1dc364f
commit
80c08282e9
42
hptoad.py
42
hptoad.py
|
@ -9,15 +9,7 @@ import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import sleekxmpp
|
import sleekxmpp
|
||||||
|
from grab import Grab # page's head title parser
|
||||||
# --- page's head title parser
|
|
||||||
import requests
|
|
||||||
from lxml.html import fromstring
|
|
||||||
from lxml.html import etree
|
|
||||||
from bs4 import UnicodeDammit
|
|
||||||
import chardet
|
|
||||||
# --- page's head title parser
|
|
||||||
|
|
||||||
|
|
||||||
opts = {
|
opts = {
|
||||||
"muc": "room@conference.example.com",
|
"muc": "room@conference.example.com",
|
||||||
|
@ -186,34 +178,26 @@ class Hptoad:
|
||||||
reply = ""
|
reply = ""
|
||||||
err = None
|
err = None
|
||||||
|
|
||||||
# --- page's head title parser
|
# --- page's head title parser
|
||||||
if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
|
if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
|
||||||
links = re.findall(r'(http[s]?://\S*)',body)
|
links = re.findall(r'(http[s]?://\S*)',body)
|
||||||
if links:
|
if links:
|
||||||
for link in links:
|
for link in links:
|
||||||
link=link.replace('>','')
|
link = link.replace('>','')
|
||||||
# http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml
|
|
||||||
page = requests.get(link)
|
try:
|
||||||
ud = UnicodeDammit(page.content, is_html=True)
|
g = Grab()
|
||||||
enc = ud.original_encoding.lower()
|
g.go(link)
|
||||||
declared_enc = ud.declared_html_encoding
|
title = g.xpath_text('//title')
|
||||||
if declared_enc:
|
except:
|
||||||
declared_enc = declared_enc.lower()
|
title = ""
|
||||||
if (declared_enc and enc != declared_enc):
|
|
||||||
detect_dict = chardet.detect(r.content)
|
if (title):
|
||||||
det_conf = detect_dict["confidence"]
|
|
||||||
det_enc = detect_dict["encoding"].lower()
|
|
||||||
if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
|
|
||||||
enc = declared_enc
|
|
||||||
content = page.content.decode(enc, "ignore").encode(enc)
|
|
||||||
htmlparser = etree.HTMLParser(encoding=enc)
|
|
||||||
root = etree.HTML(content, parser=htmlparser)
|
|
||||||
title = root.findtext('.//title')
|
|
||||||
if (len(links) > 1):
|
if (len(links) > 1):
|
||||||
reply = reply + "\nLink: %s" % title
|
reply = reply + "\nLink: %s" % title
|
||||||
else:
|
else:
|
||||||
reply = reply + "Link: %s" % title
|
reply = reply + "Link: %s" % title
|
||||||
# --- page's head title parser
|
# --- page's head title parser
|
||||||
|
|
||||||
# Has to be redone with the current bot nick.
|
# Has to be redone with the current bot nick.
|
||||||
call_regexp = re.compile("^%s[:,]" % self.bot_nick)
|
call_regexp = re.compile("^%s[:,]" % self.bot_nick)
|
||||||
|
|
|
@ -1,3 +1,2 @@
|
||||||
sleekxmpp>=1.2.0
|
sleekxmpp>=1.2.0
|
||||||
bs4
|
grab
|
||||||
lsxm
|
|
||||||
|
|
Loading…
Reference in New Issue