This repository has been archived on 2023-07-16. You can view files and clone it, but cannot push or open issues or pull requests.
hackaday.io-spambot-hunter/hadsh/wordstat.py
Stuart Longland 60d0f530bf
wordstat: Handle "invalid" UTF-8.
`pycld` is fussy when it comes to UTF-8 (see
https://github.com/mikemccand/chromium-compact-language-detector/issues/22
and https://github.com/aboSamoor/polyglot/issues/71).  This strips out
the characters that make `cld` choke.

Thanks to @andreoua for the suggested fix.
2018-12-07 21:02:39 +10:00

56 lines
1.2 KiB
Python

#!/usr/bin/env python
from .htmlstrip import html_to_text
from polyglot.text import Text
from string import punctuation
def stripunprintable(s):
    """
    Strip non-printable characters from the string ``s``.

    Characters for which ``str.isprintable()`` is true are kept unchanged.
    Non-printable whitespace (line breaks, tabs, no-break spaces and
    similar separators) is replaced by a plain ASCII space instead of
    being deleted, so words separated only by such a character are not
    fused into one token.  Every other non-printable character is
    removed.

    :param s: text to clean.
    :returns: a new string made up solely of printable characters.
    """
    return ''.join(
        # Printable characters pass through; anything else reaching this
        # point is whitespace and collapses to a plain space so that the
        # word boundary it marked survives.
        c if c.isprintable() else ' '
        for c in s
        if c.isprintable() or c.isspace()
    )
def tokenise(html_text):
    """
    Break a piece of HTML down into a list of lower-cased words.

    The markup is converted to text with ``html_to_text``, cleaned with
    ``stripunprintable`` and then split into words by polyglot's
    ``Text``.  Any token that occurs as a substring of
    ``string.punctuation`` is discarded.  If a ``ValueError`` is raised
    anywhere in that pipeline, an empty list is returned.
    """
    try:
        plain = stripunprintable(html_to_text(html_text))
        tokens = Text(plain).lower().words
        return [tok for tok in tokens if tok not in punctuation]
    except ValueError:
        # Presumably raised for an empty sequence; treat it as "no words".
        return []
def frequency(wordlist, freq=None):
    """
    Tally how many times each word occurs in ``wordlist``.

    When ``freq`` is supplied, its existing counts are updated in place
    and the same mapping is returned, allowing totals to be accumulated
    over several calls.  Otherwise a new dict is created.
    """
    counts = {} if freq is None else freq
    for word in wordlist:
        counts[word] = counts.get(word, 0) + 1
    return counts
def adjacency(wordlist, freq=None):
    """
    Tally how many times each ordered pair of neighbouring words occurs
    in ``wordlist``.

    Keys of the result are ``(previous_word, next_word)`` tuples.  When
    ``freq`` is supplied, its existing counts are updated in place and
    the same mapping is returned; otherwise a new dict is created.
    """
    counts = {} if freq is None else freq
    # Pair every word with its successor; zip stops at the shorter
    # (shifted) sequence, so the last word never starts a pair.
    for pair in zip(wordlist, wordlist[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts