Make OCR support optional; other OCR patches

3 years ago · 6ebf952dd8
parent 736480c156
commit 6ebf952dd8
4 changed files with 72 additions and 24 deletions
--- a/README.md
+++ b/README.md
@ -17,6 +17,13 @@ sudo -i -u cringebot
 pip3 install Mastodon.py html2text
 ```

+#### Install dependencies (OCR support)
+```
+apt install python3 bogofilter tesseract-ocr
+sudo -i -u cringebot
+pip3 install Mastodon.py html2text Pillow pytesseract
+```
+
 #### Create configuration
 ```
 sudo -i -u cringebot
--- a/bogofilter.py
+++ b/bogofilter.py
@ -49,6 +49,9 @@ class Mail:
        self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""}
        self.change_body(body)

+    def get_body(self):
+        return "\n".join(self.body)
+
    def change_body(self, body):
        if isinstance(body, str):
            self.body = body.strip().split("\n")
--- a/config.example.toml
+++ b/config.example.toml
@ -16,6 +16,9 @@ poll_interval = 15
 # Minutes until cringe statuses are deleted
 max_age = 600

+# Enable OCR support for image attachments (requires tesseract-ocr, Pillow and pytesseract)
+#ocr = false
+
 [clients]
    # The client name (in quotes below) is displayed in log messages, and is
    # used as the default hostname of the server to connect to, and as the
--- a/cringebot.py
+++ b/cringebot.py
@ -3,10 +3,22 @@ import sys
 import time
 import sched
 from datetime import datetime, timezone, timedelta
-import bogofilter
-import html2text
 from collections import deque
 import toml
+import re
+
+import bogofilter
+import html2text
+import urllib.request
+
+config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
+loaded_config = toml.load(config_path)
+
+OCR = loaded_config.get("ocr", False)
+
+if OCR:
+    from PIL import Image
+    import pytesseract

 from mastodon import Mastodon, MastodonNotFoundError

@ -129,22 +141,27 @@ class CringeBotClient(BotClient):
            
            # Fetch the target status
            target_status = self.api.status(target_id)
-            target_mail_text = toot_dict_to_mail(target_status).format()
+            target_mail = toot_dict_to_mail(target_status)
+            target_mail_text = target_mail.format()
+            
            
            tokens = deque(command.split())
            while True:
                token = tokens.popleft()
                if token == "cringe":
-                    if event not in ["categorise", "learn"]:
+                    if not any(line.strip() for line in target_mail.body):
+                        self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
+                        return True
+                    elif event not in ["categorise", "learn"]:
                        self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
                        break
                    elif self.is_cringe(target_id):
                        self.respond(status, "Status was already learned as cringe", {"event": "error", "type": "redundant", "target": target_id})
                        break
                    elif self.is_based(target_id):
-                        bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir)
+                        bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM])
                    else:
-                        bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir)
+                        bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_SPAM])
                    
                    self.make_cringe(target_id, target_mail_text)
                    self.enqueue_deletion(target_id)
@ -153,16 +170,19 @@ class CringeBotClient(BotClient):
                    break

                elif token == "based":
-                    if event not in ["categorise", "learn"]:
+                    if not any(line.strip() for line in target_mail.body):
+                        self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
+                        return True
+                    elif event not in ["categorise", "learn"]:
                        self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
                        break
                    elif self.is_based(target_id):
                        self.respond(status, "Status was already learned as based", {"event": "error", "type": "redundant", "target": target_id})
                        break
                    elif self.is_cringe(target_id):
-                        bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir)
+                        bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM])
                    else:
-                        bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir)
+                        bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_HAM])
                    
                    self.make_based(target_id, target_mail_text)
                    self.unqueue_deletion(target_id)
@ -171,16 +191,19 @@ class CringeBotClient(BotClient):
                    break

                elif token == "unlearn":
-                    if event not in ["categorise", "learn"]:
+                    if not any(line.strip() for line in target_mail.body):
+                        self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
+                        return True
+                    elif event not in ["categorise", "learn"]:
                        self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
                        break
                    elif self.is_unsure(target_id):
                        self.respond(status, "Status was already unsure", {"event": "error", "type": "redundant", "target": target_id})
                        break
                    elif self.is_cringe(target_id):
-                        bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir)
+                        bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM])
                    elif self.is_based(target_id):
-                        bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir)
+                        bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM])

                    self.make_unsure(target_id, target_mail_text)
                    self.unqueue_deletion(target_id)
@ -211,23 +234,22 @@ class CringeBotClient(BotClient):
            return
       
        # Create faux HTML email of status
-        mail_text = toot_dict_to_mail(status).format()
- 
-        # Format and log plain-text preview
-        md_text = self.h2t.handle(status["content"])
-        preview = toot_dict_to_mail(status)
-        preview.change_body(md_text)
-        preview_text = preview.format()
+        mail = toot_dict_to_mail(status)
+        mail_text = mail.format()
       
        self.log()
-        self.log(preview_text)
+        self.log(mail_text)
        self.log()

+        if not any(line.strip() for line in mail.body):
+            self.log("Not classifying {} because it has no content".format(status_id))
+            return
+
        # Process any commands
        if self.process_commands(status):
            return

-        result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir)
+        result = bogofilter.run(mail_text, db_dir = self.db_dir, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY])
        bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score))
        if result.category == bogofilter.SPAM:
            self.log("CRINGE: Enqueuing status {} for deletion".format(status_id))
@ -323,10 +345,23 @@ def toot_dict_to_mail(toot_dict):

    body = toot_dict["content"]

+    if OCR:
+        for media in toot_dict["media_attachments"]:
+            if media["type"] == "image":
+                try:
+                    with urllib.request.urlopen(media["url"]) as image:
+                        ocr_text = pytesseract.image_to_string(Image.open(image))
+                        words = re.findall(r"\w+", ocr_text)
+                        tokens = ["ocr_" + word.lower() for word in words]
+                        body += "\n\n" + " ".join(tokens)
+                except Exception:
+                    import traceback  # local import: traceback is not among this file's visible imports
+                    print("Skipping OCR on attachment due to exception\n" + traceback.format_exc())
+
    return bogofilter.Mail(headers = headers, body = body)

-config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
-bot = Bot(CringeBotClient, toml.load(config_path))
+bot = Bot(CringeBotClient, loaded_config)
+del loaded_config
 bot.start()

 while True: