Using grab module for parsing

feder 2017-02-20 17:51:46 +03:00
parent 31b1dc364f
commit 80c08282e9
2 changed files with 18 additions and 35 deletions

View File

@@ -9,15 +9,7 @@ import subprocess
 import sys
 import time
 import sleekxmpp
-# --- page's head title parser
-import requests
-from lxml.html import fromstring
-from lxml.html import etree
-from bs4 import UnicodeDammit
-import chardet
-# --- page's head title parser
+from grab import Grab # page's head title parser
 
 opts = {
     "muc": "room@conference.example.com",
@@ -186,34 +178,26 @@ class Hptoad:
         reply = ""
         err = None
         # --- page's head title parser
         if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
             links = re.findall(r'(http[s]?://\S*)',body)
             if links:
                 for link in links:
-                    link=link.replace('>','')
-                    # http://stackoverflow.com/questions/29681486/problems-with-encoding-while-parsing-html-document-with-lxml
-                    page = requests.get(link)
-                    ud = UnicodeDammit(page.content, is_html=True)
-                    enc = ud.original_encoding.lower()
-                    declared_enc = ud.declared_html_encoding
-                    if declared_enc:
-                        declared_enc = declared_enc.lower()
-                    if (declared_enc and enc != declared_enc):
-                        detect_dict = chardet.detect(r.content)
-                        det_conf = detect_dict["confidence"]
-                        det_enc = detect_dict["encoding"].lower()
-                        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
-                            enc = declared_enc
-                    content = page.content.decode(enc, "ignore").encode(enc)
-                    htmlparser = etree.HTMLParser(encoding=enc)
-                    root = etree.HTML(content, parser=htmlparser)
-                    title = root.findtext('.//title')
-                    if (len(links) > 1):
-                        reply = reply + "\nLink: %s" % title
-                    else:
-                        reply = reply + "Link: %s" % title
+                    link = link.replace('>','')
+                    try:
+                        g = Grab()
+                        g.go(link)
+                        title = g.xpath_text('//title')
+                    except:
+                        title = ""
+                    if (title):
+                        if (len(links) > 1):
+                            reply = reply + "\nLink: %s" % title
+                        else:
+                            reply = reply + "Link: %s" % title
         # --- page's head title parser
         # Has to be redone with the current bot nick.
         call_regexp = re.compile("^%s[:,]" % self.bot_nick)
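For reference, the grab-based title lookup added above can be exercised on its own. The sketch below is not part of the commit: the helper name link_titles is hypothetical, and it only mirrors the calls the new code relies on (Grab().go() to fetch the page, xpath_text('//title') to read its title, as in the grab version this commit targets). grab does its own charset handling when it builds the document, which is presumably why the manual UnicodeDammit/chardet logic deleted above is no longer needed.

    # Minimal sketch (not part of this commit) of the title extraction used above.
    import re
    from grab import Grab

    def link_titles(body):
        """Return a 'Link: <title>' line for every URL found in body."""
        reply = ""
        links = re.findall(r'(http[s]?://\S*)', body)
        for link in links:
            link = link.replace('>', '')
            try:
                g = Grab()
                g.go(link)                       # download the page
                title = g.xpath_text('//title')  # text of the <title> element
            except Exception:
                title = ""                       # skip links that fail to load or parse
            if title:
                prefix = "\nLink: " if len(links) > 1 else "Link: "
                reply = reply + prefix + title
        return reply

For example, link_titles("see https://example.com") would return "Link: Example Domain", assuming the page is reachable.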

View File

@@ -1,3 +1,2 @@
 sleekxmpp>=1.2.0
-bs4
-lsxm
+grab