Using grab module for parsing

Add: page's head title parser
2017-02-20 17:51:46 +03:00 · 2017-02-19 14:16:40 +03:00
2 changed files with 23 additions and 0 deletions
--- a/hptoad.py
+++ b/hptoad.py
@@ -9,6 +9,7 @@ import subprocess
 import sys
 import time
 import sleekxmpp
+from grab import Grab # page's head title parser

 opts = {
  "muc":      "room@conference.example.com",
@@ -177,6 +178,27 @@ class Hptoad:
        reply = ""
        err = None

+        # --- page's head title parser
+        if not (body.startswith("Link:") or body.startswith("\nLink:")) and not (body.startswith(self.bot_nick)):
+            links = re.findall(r'(http[s]?://\S*)',body)
+            if links:
+                for link in links:
+                    link = link.replace('>','')
+
+                    try:
+                        g = Grab()
+                        g.go(link)
+                        title = g.xpath_text('//title')
+                    except:
+                        title = ""
+
+                    if (title):
+                        if (len(links) > 1):
+                            reply = reply + "\nLink: %s" % title
+                        else:
+                            reply = reply + "Link: %s" % title
+        # --- page's head title parser
+
        # Has to be redone with the current bot nick.
        call_regexp = re.compile("^%s[:,]" % self.bot_nick)

--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
 sleekxmpp>=1.2.0
+grab
Author	SHA1	Message	Date
feder	80c08282e9	Using grab module for parsing	2017-02-20 17:51:46 +03:00
feder	31b1dc364f	Add: page's head title parser	2017-02-19 14:16:40 +03:00