wordstat: Add in word statistics parsing.

This commit is contained in:
Stuart Longland 2018-02-02 23:15:32 +10:00
parent 1d9c2a49c2
commit 3ac39b5a00
Signed by: stuartl
GPG Key ID: 6AA32EFB18079BAA

39
hadsh/wordstat.py Normal file
View File

@ -0,0 +1,39 @@
#!/usr/bin/env python
from .htmlstrip import html_to_text
from polyglot.text import Text
def tokenise(html_text):
    """
    Break a piece of HTML up into its lower-cased words.

    The markup is first reduced to plain text with ``html_to_text``, the
    result is wrapped in a polyglot ``Text`` and lower-cased, and the
    ``words`` of that text are returned as a plain ``list``.
    """
    plain_text = html_to_text(html_text)
    lowered = Text(plain_text).lower()
    return list(lowered.words)
def frequency(wordlist, freq=None):
    """
    Tally how many times each word occurs in ``wordlist``.

    Counts are accumulated into ``freq``, which is updated in place and
    returned; when ``freq`` is omitted a new empty dict is used.  Passing
    the result of an earlier call lets counts be built up over several
    word lists.
    """
    counts = {} if freq is None else freq
    for word in wordlist:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts
def adjacency(wordlist, freq=None):
    """
    Scan the words and count how often each pair of adjacent words appears.

    ``wordlist`` may be any iterable of hashable words (a list, a
    generator, an iterator, ...); it is walked exactly once and never
    sliced or copied.  Each key of the result is a ``(previous, next)``
    tuple and each value is the number of times that pair occurred
    back-to-back.

    Counts are accumulated into ``freq``, which is updated in place and
    returned; when ``freq`` is omitted a new empty dict is used.  Input
    with fewer than two words contributes no pairs.
    """
    if freq is None:
        freq = {}

    words = iter(wordlist)
    try:
        prev_w = next(words)
    except StopIteration:
        # Empty input: there are no pairs to count.
        return freq

    for next_w in words:
        pair = (prev_w, next_w)
        try:
            freq[pair] += 1
        except KeyError:
            freq[pair] = 1
        # The current word becomes the left-hand side of the next pair.
        prev_w = next_w
    return freq