#!/usr/bin/env python
"""
Word statistics helpers: tokenise HTML text into words, then count how
often individual words and adjacent word pairs occur.
"""
# Provenance: introduced as hadsh/wordstat.py by the patch
# "wordstat: Add in word statistics parsing."
# (commit 3ac39b5a0060a2f0f76dc666aa3eceb07ef32b09, Stuart Longland,
# 2 Feb 2018).

from .htmlstrip import html_to_text
from polyglot.text import Text


def tokenise(html_text):
    """
    Return a list of words that appear in the text.

    The input is first passed through ``html_to_text`` (presumably to
    strip the markup -- see ``htmlstrip`` to confirm), wrapped in a
    polyglot ``Text``, lower-cased, and its ``words`` are copied into a
    plain ``list``.

    :param html_text: HTML source to extract words from.
    :returns: ``list`` of the lower-cased words, as produced by polyglot.
    """
    plain_text = html_to_text(html_text)
    return list(Text(plain_text).lower().words)


def frequency(wordlist, freq=None):
    """
    Scan the word list given and count how often each word appears.

    :param wordlist: iterable of hashable words.
    :param freq: optional existing ``{word: count}`` mapping to
        accumulate into.  It is updated in place, which allows counts
        from several word lists to be merged by passing the same
        mapping to successive calls.  A new ``dict`` is created when
        omitted.
    :returns: the mapping of word to occurrence count (the same object
        as ``freq`` when one was supplied).
    """
    # Compare against None explicitly: an empty mapping supplied by the
    # caller must still be the object that gets updated and returned.
    if freq is None:
        freq = {}
    for w in wordlist:
        freq[w] = freq.get(w, 0) + 1
    return freq


def adjacency(wordlist, freq=None):
    """
    Scan the word list and count how often each pair of words appears.

    A "pair" is two consecutive words, keyed as the tuple
    ``(previous_word, next_word)``; order matters.  Input with fewer
    than two words contributes no pairs.

    :param wordlist: iterable of hashable words.  Any iterable is
        accepted (including generators); it is consumed in one pass
        without being copied.
    :param freq: optional existing ``{(prev, next): count}`` mapping to
        accumulate into.  It is updated in place; a new ``dict`` is
        created when omitted.
    :returns: the mapping of word pair to occurrence count (the same
        object as ``freq`` when one was supplied).
    """
    if freq is None:
        freq = {}

    words = iter(wordlist)
    try:
        prev_w = next(words)
    except StopIteration:
        # Empty input: there are no pairs to count.
        return freq

    for next_w in words:
        pair = (prev_w, next_w)
        freq[pair] = freq.get(pair, 0) + 1
        # Slide the window: the current word starts the next pair.
        prev_w = next_w
    return freq