This repository has been archived on 2023-07-16. You can view files and clone it, but cannot push or open issues or pull requests.
hackaday.io-spambot-hunter/hadsh/db/model.py

671 lines
16 KiB
Python

from functools import partial
from tornado.gen import coroutine, Return
import base64
class Row(object):
"""
Raw row class, simplifies manipulation of tables.
"""
def __init__(self, db, row):
for column in self._COLUMNS_:
if not hasattr(self, column):
setattr(self.__class__, column,
property(\
partial(\
lambda c, s : s._get_col(c), column),
partial(\
lambda c, s, val : s._set_col(c, val), column),
partial(\
lambda c, s : s._del_col(c), column)))
self._data = dict(zip(self._COLUMNS_, row))
self._dirty = {}
self._db = db
@classmethod
@coroutine
def fetch(cls, db, where, *args, single=False):
rows = yield db.query(
'''
SELECT
%(columns)s
FROM
"%(table)s"
WHERE
%(where)s
''' % {
'columns': ', '.join(cls._COLUMNS_),
'table': cls._TABLE_,
'where': where
}, *args)
res = [
cls(db, row) for row in rows
]
if single:
try:
raise Return(res[0])
except IndexError:
raise Return(None)
else:
raise Return(res)
@coroutine
def refresh(self):
# Exclude primary keys from the column list
data_columns = list(filter(
lambda c : c not in set(self._PRIMARY_KEYS_),
self._COLUMNS_))
# Determine the "where" criteria
where_str = ' AND '.join([
'(%s = %%s)' % (col,)
for col in self._PRIMARY_KEYS_
])
where_args = tuple([
self._get_col(c) for c in self._PRIMARY_KEYS_
])
rows = yield self._db.query(
'''
SELECT
%(columns)s
FROM
"%(table)s"
WHERE
%(where)s
''' % {
'columns': ', '.join(data_columns),
'table': self._TABLE_,
'where': where_str
}, *where_args)
if len(rows) != 1:
raise ValueError(
'Row no longer in database or '\
'primary keys ambiguous')
self._data.update(dict(zip(data_columns, rows[0])))
@coroutine
def commit(self):
try:
dirty_data = self._dirty.copy()
self._dirty = {}
dirty_columns = list(dirty_data.keys())
dirty_args = [dirty_data[c] for c in dirty_columns]
where_args = [self._data[c] for c in self._PRIMARY_KEYS_]
sql = '''UPDATE
"%(table)s"
SET
%(updates)s
WHERE
%(criteria)s''' % {
'table': self._TABLE_,
'updates': ', '.join([
'"%s"=%%s' % (c,)
for c in dirty_columns
]),
'criteria': ' AND '.join([
'("%s"=%%s)' % (c,)
for c in self._PRIMARY_KEYS_
])
}
yield self._db.query(sql,
*tuple(dirty_args + where_args),
commit=True)
self._data.update(dirty_data)
except:
dirty_data.update(self._dirty)
self._dirty = dirty_data
raise
def _get_col(self, column):
return self._dirty.get(column, self._data[column])
def _set_col(self, column, value):
self._dirty[column] = value
def _del_col(self, column, value):
self._dirty.pop(column, None)
class User(Row):
"""
All recognised Hackaday.io users, including legitmate ones.
"""
# Table
_TABLE_ = 'user'
# Primary key
_PRIMARY_KEYS_ = ['user_id']
# Columns
_COLUMNS_ = [
'user_id',
'screen_name',
'url',
'avatar_id',
'created',
'had_created',
'last_update'
]
def __repr__(self):
return 'User(user_id=%r, screen_name=%r)' \
% (self.user_id, self.screen_name)
@coroutine
def get_groups(self):
"""
Return a set of groups linked to this user.
"""
groups = yield self._db.query(
'''
SELECT
g.name
FROM
"group" g,
"user_group_assoc" uga
WHERE
g.group_id=uga.group_id
AND
uga.user_id=%s
''', self.user_id)
raise Return(set([
name for (name,) in groups
]))
@coroutine
def set_groups(self, groups):
"""
Set the list of groups linked to this user.
"""
yield self.add_groups(groups)
yield self.mask_groups(groups)
@coroutine
def add_groups(self, groups):
# Add the new groups
yield self._db.query('''
INSERT INTO "user_group_assoc"
(user_id, group_id)
SELECT
%s, group_id
FROM
"group"
WHERE
name IN %s
ON CONFLICT DO NOTHING
''', self.user_id, tuple(groups))
@coroutine
def rm_groups(self, groups):
# Remove groups listed listed
yield self._db.query('''
DELETE FROM "user_group_assoc"
WHERE
user_id=%S
AND
group_id IN (
SELECT
%s, group_id
FROM
"group"
WHERE
name IN %s
)
''', self.user_id, tuple(groups))
@coroutine
def mask_groups(self, groups):
# Remove groups not listed
yield self._db.query('''
DELETE FROM "user_group_assoc"
WHERE
user_id=%S
AND
group_id NOT IN (
SELECT
%s, group_id
FROM
"group"
WHERE
name IN %s
)
''', self.user_id, tuple(groups))
@coroutine
def get_detail(self):
raise Return((yield UserDetail.fetch(self._db,
'user_id=%s LIMIT 1', self.user_id,
single=True)))
@coroutine
def get_deferral(self):
raise Return((yield DeferredUser.fetch(self._db,
'user_id=%s LIMIT 1', self.user_id,
single=True)))
class Account(Row):
_TABLE_ = 'account'
_PRIMARY_KEYS_ = ['user_id']
_COLUMNS_ = [
'user_id',
'name',
'hashedpassword',
'changenextlogin'
]
@coroutine
def get_user(self):
raise Return((yield User.fetch(self._db,
'user_id=%s LIMIT 1', self.user_id,
single=True)))
class DeferredUser(Row):
"""
Object to represent when a user account has been added but inspection
has been deferred.
"""
_TABLE_ = 'deferred_user'
_PRIMARY_KEYS_ = ['user_id']
_COLUMNS_ = [
'user_id',
'inspect_time',
'inspections'
]
class Group(Row):
"""
Groups used for classifying users.
"""
_TABLE_ = 'group'
_PRIMARY_KEYS_ = ['group_id']
_COLUMNS_ = [
'group_id',
'name'
]
def __repr__(self):
return 'Group(group_id=%r, name=%r)' \
% (self.group_id, self.name)
class Session(Row):
"""
Session token storage. The session ID will be emitted to the user, so
we use a UUID field to make the value unguessable.
"""
_TABLE_ = 'session'
_PRIMARY_KEYS_ = ['session_id']
_COLUMNS_ = [
'session_id',
'user_id',
'expiry_date'
]
def __repr__(self):
return 'Session(user=%r)' \
% (self.user)
@coroutine
def get_user(self):
raise Return((yield User.fetch(self._db,
'user_id=%s LIMIT 1', self.user_id,
single=True)))
class UserDetail(Row):
"""
Detail on 'suspect' users. Only users that have been identified as
possibly spammy by the search algorithm, or users that have been flagged
as spammy by logged-in users, appear here.
"""
_TABLE_ = 'user_detail'
_PRIMARY_KEYS_ = ['user_id']
_COLUMNS_ = [
'user_id',
'about_me',
'who_am_i',
'what_i_would_like_to_do',
'location',
'projects'
]
class UserLink(Row):
"""
Links attached to 'suspect' users.
"""
_TABLE_ = 'user_link'
_PRIMARY_KEYS_ = ['user_id', 'url']
_COLUMNS_ = [
'user_id',
'title',
'url'
]
def __repr__(self):
return 'UserLink(user_id=%r, title=%r, url=%r)' \
% (self.user_id, self.title, self.url)
class Avatar(Row):
"""
A cache of users' avatars, as some share the same image.
"""
_TABLE_ = 'avatar'
_PRIMARY_KEYS_ = ['avatar_id']
_COLUMNS_ = [
'avatar_id',
'url',
'avatar',
'avatar_type'
]
@coroutine
def get_hashes(self):
hashes = yield AvatarHash.fetch(self._db,
'''
hash_id IN (
SELECT
hash_id
FROM
"avatar_hash_assoc"
WHERE
avatar_id=%s
)''', self.avatar_id)
raise Return(dict([
(h.hashalgo, h) for h in hashes
]))
class AvatarHash(Row):
"""
A hash of users' avatars and their scores.
"""
_TABLE_ = 'avatar_hash'
_PRIMARY_KEYS_ = ['hash_id']
_COLUMNS_ = [
'hash_id',
'hashalgo',
'hashdata'
]
@property
def hashstr(self):
return base64.a85encode(self.hashdata)
def __repr__(self):
return '<%s #%d %s %s>' % (
self.__class__.__name__,
self.hash_id,
self.hashalgo,
self.hashstr)
class Tag(Row):
"""
A list of tags seen applied to users' accounts.
"""
_TABLE_ = 'tag'
_PRIMARY_KEYS_ = ['tag_id']
_COLUMNS_ = [
'tag_id',
'tag'
]
class NewestUserPageRefresh(Row):
"""
A record of when each page of the "newst users" list was last refreshed.
"""
_TABLE_ = 'newest_user_page_refresh'
_PRIMARY_KEYS_ = ['page_num']
_COLUMNS_ = [
'page_num',
'refresh_date'
]
class NewUser(Row):
"""
A record of a new user that is to be inspected.
"""
_TABLE_ = 'new_user'
_PRIMARY_KEYS_ = ['user_id']
_COLUMNS_ = [
'user_id'
]
class Hostname(Row):
"""
A hostname that appears in the links of user profiles.
"""
_TABLE_ = 'hostname'
_PRIMARY_KEYS_ = ['hostname_id']
_COLUMNS_ = [
'hostname_id',
'hostname',
'score',
'count'
]
def __repr__(self):
return 'Hostname(hostname_id=%r, hostname=%r, score=%r, count=%r)' \
% (self.hostname_id, self.hostname, self.score, self.count)
class Word(Row):
"""
A single word in the vocabulary of the Hackaday.io community.
"""
_TABLE_ = 'word'
_PRIMARY_KEYS_ = ['word_id']
_COLUMNS_ = [
'word_id',
'word',
'score',
'count'
]
def __repr__(self):
return 'Word(word_id=%r, word=%r, score=%r, count=%r)' \
% (self.word_id, self.word, self.score, self.count)
class WordAdjacent(Row):
"""
How often two words appear next to each other.
"""
_TABLE_ = 'word_adjacent'
_PRIMARY_KEYS_ = ['proceeding_id', 'following_id']
_COLUMNS_ = [
'proceeding_id',
'following_id',
'score',
'count'
]
def __repr__(self):
return 'WordAdjacent(proceeding=%r, following=%r, score=%r, count=%r)' \
% (self.proceeding_id, self.following_id, self.score, self.count)
class UserWord(Row):
"""
Words used by a given user
"""
_TABLE_ = 'user_word'
_PRIMARY_KEYS_ = ['user_id', 'word_id']
_COLUMNS_ = [
'user_id',
'word_id',
'count'
]
def __repr__(self):
return 'UserWord(user=%r, word=%r, count=%r)' \
% (self.user_id, self.word_id, self.count)
class UserHostname(Row):
"""
Hostnames used by a given user
"""
_TABLE_ = 'user_hostname'
_PRIMARY_KEYS_ = ['user_id', 'hostname_id']
_COLUMNS_ = [
'user_id',
'hostname_id',
'count'
]
def __repr__(self):
return 'UserHostname(user=%r, hostname=%r, count=%r)' \
% (self.user_id, self.hostname_id, self.count)
class UserWordAdjacent(Row):
"""
Adjacent words used by a given user
"""
_TABLE_ = 'user_word_adjacent'
_PRIMARY_KEYS_ = [
'user_id',
'proceeding_id',
'following_id'
]
_COLUMNS_ = [
'user_id',
'proceeding_id',
'following_id',
'count'
]
def __repr__(self):
return 'UserWordAdjacent(user=%r, proceeding=%r, '\
'following=%r, count=%r)' \
% (self.user_id, self.proceeding_id,
self.following_id,
self.count)
class UserToken(Row):
"""
Suspect tokens found in regular expression search.
"""
_TABLE_ = 'user_token'
_PRIMARY_KEYS_ = ['user_id', 'token']
_COLUMNS_ = [
'user_id',
'token',
'count'
]
class Trait(Row):
"""
Traits are characteristics which are common to subsets of users.
"""
_TABLE_ = 'trait'
_PRIMARY_KEYS_ = ['trait_id']
_COLUMNS_ = [
'trait_id',
'trait_class',
'trait_type',
'score',
'count',
'weight'
]
class TraitInstance(Row):
"""
Instances of particular traits (e.g. if the trait is a 'word',
this may be the score for a particular word).
"""
_TABLE_ = 'trait_instance'
_PRIMARY_KEYS_ = ['trait_inst_id']
_COLUMNS_ = [
'trait_inst_id',
'trait_id',
'score',
'count'
]
@property
def instance(self):
return 't%di%d' % (self.trait_id, self.trait_inst_id)
class TraitInstanceString(TraitInstance):
"""
Trait instance that is described by a string.
"""
_COLUMNS_ = TraitInstance._COLUMNS_ + [
'trait_string'
]
@property
def instance(self):
return self.trait_string
class TraitInstanceAvatarHash(TraitInstance):
"""
Trait instance that references an avatar hash.
"""
_COLUMNS_ = TraitInstance._COLUMNS_ + [
'trait_hash_id'
]
@property
def instance(self):
return self.trait_hash_id
class TraitInstancePair(TraitInstance):
"""
Instances of two other traits that appear together.
"""
_COLUMNS_ = TraitInstance._COLUMNS_ + [
'prev_id',
'next_id'
]
class UserTrait(Row):
"""
Traits linked to a particular user.
"""
_TABLE_ = 'user_trait'
_PRIMARY_KEYS_ = ['user_id', 'trait_id']
_COLUMNS_ = [
'user_id',
'trait_id',
'count'
]
class UserTraitInstance(Row):
"""
Trait instances linked to a particular user.
"""
_TABLE_ = 'user_trait_instance'
_PRIMARY_KEYS_ = ['user_id', 'trait_inst_id']
_COLUMNS_ = [
'user_id',
'trait_inst_id',
'count'
]