From 3ac39b5a0060a2f0f76dc666aa3eceb07ef32b09 Mon Sep 17 00:00:00 2001
From: Stuart Longland <me@vk4msl.id.au>
Date: Fri, 2 Feb 2018 23:15:32 +1000
Subject: [PATCH] wordstat: Add in word statistics parsing.

---
 hadsh/wordstat.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 hadsh/wordstat.py

diff --git a/hadsh/wordstat.py b/hadsh/wordstat.py
new file mode 100644
index 0000000..ac9dc05
--- /dev/null
+++ b/hadsh/wordstat.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+
+from .htmlstrip import html_to_text
+from polyglot.text import Text
+
+
+def tokenise(html_text):
+    """Convert HTML to plain text and list its words after lower-casing."""
+    plain = html_to_text(html_text)
+    lowered = Text(plain).lower()
+    return list(lowered.words)
+
+
+def frequency(wordlist, freq=None):
+    """
+    Tally the occurrences of each word in ``wordlist``.
+
+    If ``freq`` is given, its counts are updated in place and it is
+    returned; otherwise a new dict is created and returned.
+    """
+    counts = {} if freq is None else freq
+    for word in wordlist:
+        # Words not seen before start from zero.
+        counts[word] = counts.get(word, 0) + 1
+    return counts
+
+
+def adjacency(wordlist, freq=None):
+    """
+    Count each (word, following word) pair in ``wordlist`` (any iterable),
+    adding to ``freq`` in place when given; returns the pair-count dict.
+    """
+    pairs = {} if freq is None else freq
+    words = iter(wordlist)
+    prev_w = next(words, None)  # default only matters for empty input
+    for next_w in words:
+        pairs[(prev_w, next_w)] = pairs.get((prev_w, next_w), 0) + 1
+        prev_w = next_w
+    return pairs