bitbake: hashserv: Add Unihash Garbage Collection

Adds support for removing unused unihashes from the database. This is
done using a "mark and sweep" style of garbage collection where a
collection is started by marking which unihashes should be kept in the
database, then performing a sweep to remove any unmarked hashes.

(Bitbake rev: 433d4a075a1acfbd2a2913061739353a84bb01ed)

Signed-off-by: Joshua Watt <JPEWhacker@gmail.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
This commit is contained in:
Joshua Watt
2024-02-18 15:59:46 -07:00
committed by Richard Purdie
parent 324c9fd666
commit 1effd1014d
6 changed files with 684 additions and 116 deletions

View File

@@ -28,6 +28,7 @@ from sqlalchemy import (
delete,
update,
func,
inspect,
)
import sqlalchemy.engine
from sqlalchemy.orm import declarative_base
@@ -36,16 +37,17 @@ from sqlalchemy.exc import IntegrityError
Base = declarative_base()
class UnihashesV2(Base):
__tablename__ = "unihashes_v2"
class UnihashesV3(Base):
__tablename__ = "unihashes_v3"
id = Column(Integer, primary_key=True, autoincrement=True)
method = Column(Text, nullable=False)
taskhash = Column(Text, nullable=False)
unihash = Column(Text, nullable=False)
gc_mark = Column(Text, nullable=False)
__table_args__ = (
UniqueConstraint("method", "taskhash"),
Index("taskhash_lookup_v3", "method", "taskhash"),
Index("taskhash_lookup_v4", "method", "taskhash"),
)
@@ -79,6 +81,36 @@ class Users(Base):
__table_args__ = (UniqueConstraint("username"),)
class Config(Base):
__tablename__ = "config"
id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(Text, nullable=False)
value = Column(Text)
__table_args__ = (
UniqueConstraint("name"),
Index("config_lookup", "name"),
)
#
# Old table versions
#
DeprecatedBase = declarative_base()
class UnihashesV2(DeprecatedBase):
__tablename__ = "unihashes_v2"
id = Column(Integer, primary_key=True, autoincrement=True)
method = Column(Text, nullable=False)
taskhash = Column(Text, nullable=False)
unihash = Column(Text, nullable=False)
__table_args__ = (
UniqueConstraint("method", "taskhash"),
Index("taskhash_lookup_v3", "method", "taskhash"),
)
class DatabaseEngine(object):
def __init__(self, url, username=None, password=None):
self.logger = logging.getLogger("hashserv.sqlalchemy")
@@ -91,6 +123,9 @@ class DatabaseEngine(object):
self.url = self.url.set(password=password)
async def create(self):
def check_table_exists(conn, name):
return inspect(conn).has_table(name)
self.logger.info("Using database %s", self.url)
self.engine = create_async_engine(self.url, poolclass=NullPool)
@@ -99,6 +134,24 @@ class DatabaseEngine(object):
self.logger.info("Creating tables...")
await conn.run_sync(Base.metadata.create_all)
if await conn.run_sync(check_table_exists, UnihashesV2.__tablename__):
self.logger.info("Upgrading Unihashes V2 -> V3...")
statement = insert(UnihashesV3).from_select(
["id", "method", "unihash", "taskhash", "gc_mark"],
select(
UnihashesV2.id,
UnihashesV2.method,
UnihashesV2.unihash,
UnihashesV2.taskhash,
literal("").label("gc_mark"),
),
)
self.logger.debug("%s", statement)
await conn.execute(statement)
await conn.run_sync(Base.metadata.drop_all, [UnihashesV2.__table__])
self.logger.info("Upgrade complete")
def connect(self, logger):
return Database(self.engine, logger)
@@ -118,6 +171,15 @@ def map_user(row):
)
def _make_condition_statement(table, condition):
where = {}
for c in table.__table__.columns:
if c.key in condition and condition[c.key] is not None:
where[c] = condition[c.key]
return [(k == v) for k, v in where.items()]
class Database(object):
def __init__(self, engine, logger):
self.engine = engine
@@ -135,17 +197,52 @@ class Database(object):
await self.db.close()
self.db = None
async def _execute(self, statement):
self.logger.debug("%s", statement)
return await self.db.execute(statement)
async def _set_config(self, name, value):
while True:
result = await self._execute(
update(Config).where(Config.name == name).values(value=value)
)
if result.rowcount == 0:
self.logger.debug("Config '%s' not found. Adding it", name)
try:
await self._execute(insert(Config).values(name=name, value=value))
except IntegrityError:
# Race. Try again
continue
break
def _get_config_subquery(self, name, default=None):
if default is not None:
return func.coalesce(
select(Config.value).where(Config.name == name).scalar_subquery(),
default,
)
return select(Config.value).where(Config.name == name).scalar_subquery()
async def _get_config(self, name):
result = await self._execute(select(Config.value).where(Config.name == name))
row = result.first()
if row is None:
return None
return row.value
async def get_unihash_by_taskhash_full(self, method, taskhash):
statement = (
select(
OuthashesV2,
UnihashesV2.unihash.label("unihash"),
UnihashesV3.unihash.label("unihash"),
)
.join(
UnihashesV2,
UnihashesV3,
and_(
UnihashesV2.method == OuthashesV2.method,
UnihashesV2.taskhash == OuthashesV2.taskhash,
UnihashesV3.method == OuthashesV2.method,
UnihashesV3.taskhash == OuthashesV2.taskhash,
),
)
.where(
@@ -164,12 +261,12 @@ class Database(object):
async def get_unihash_by_outhash(self, method, outhash):
statement = (
select(OuthashesV2, UnihashesV2.unihash.label("unihash"))
select(OuthashesV2, UnihashesV3.unihash.label("unihash"))
.join(
UnihashesV2,
UnihashesV3,
and_(
UnihashesV2.method == OuthashesV2.method,
UnihashesV2.taskhash == OuthashesV2.taskhash,
UnihashesV3.method == OuthashesV2.method,
UnihashesV3.taskhash == OuthashesV2.taskhash,
),
)
.where(
@@ -208,13 +305,13 @@ class Database(object):
statement = (
select(
OuthashesV2.taskhash.label("taskhash"),
UnihashesV2.unihash.label("unihash"),
UnihashesV3.unihash.label("unihash"),
)
.join(
UnihashesV2,
UnihashesV3,
and_(
UnihashesV2.method == OuthashesV2.method,
UnihashesV2.taskhash == OuthashesV2.taskhash,
UnihashesV3.method == OuthashesV2.method,
UnihashesV3.taskhash == OuthashesV2.taskhash,
),
)
.where(
@@ -234,12 +331,12 @@ class Database(object):
async def get_equivalent(self, method, taskhash):
statement = select(
UnihashesV2.unihash,
UnihashesV2.method,
UnihashesV2.taskhash,
UnihashesV3.unihash,
UnihashesV3.method,
UnihashesV3.taskhash,
).where(
UnihashesV2.method == method,
UnihashesV2.taskhash == taskhash,
UnihashesV3.method == method,
UnihashesV3.taskhash == taskhash,
)
self.logger.debug("%s", statement)
async with self.db.begin():
@@ -248,13 +345,9 @@ class Database(object):
async def remove(self, condition):
async def do_remove(table):
where = {}
for c in table.__table__.columns:
if c.key in condition and condition[c.key] is not None:
where[c] = condition[c.key]
where = _make_condition_statement(table, condition)
if where:
statement = delete(table).where(*[(k == v) for k, v in where.items()])
statement = delete(table).where(*where)
self.logger.debug("%s", statement)
async with self.db.begin():
result = await self.db.execute(statement)
@@ -263,19 +356,74 @@ class Database(object):
return 0
count = 0
count += await do_remove(UnihashesV2)
count += await do_remove(UnihashesV3)
count += await do_remove(OuthashesV2)
return count
async def get_current_gc_mark(self):
async with self.db.begin():
return await self._get_config("gc-mark")
async def gc_status(self):
async with self.db.begin():
gc_mark_subquery = self._get_config_subquery("gc-mark", "")
result = await self._execute(
select(func.count())
.select_from(UnihashesV3)
.where(UnihashesV3.gc_mark == gc_mark_subquery)
)
keep_rows = result.scalar()
result = await self._execute(
select(func.count())
.select_from(UnihashesV3)
.where(UnihashesV3.gc_mark != gc_mark_subquery)
)
remove_rows = result.scalar()
return (keep_rows, remove_rows, await self._get_config("gc-mark"))
async def gc_mark(self, mark, condition):
async with self.db.begin():
await self._set_config("gc-mark", mark)
where = _make_condition_statement(UnihashesV3, condition)
if not where:
return 0
result = await self._execute(
update(UnihashesV3)
.values(gc_mark=self._get_config_subquery("gc-mark", ""))
.where(*where)
)
return result.rowcount
async def gc_sweep(self):
async with self.db.begin():
result = await self._execute(
delete(UnihashesV3).where(
# A sneaky conditional that provides some errant use
# protection: If the config mark is NULL, this will not
# match any rows because No default is specified in the
# select statement
UnihashesV3.gc_mark
!= self._get_config_subquery("gc-mark")
)
)
await self._set_config("gc-mark", None)
return result.rowcount
async def clean_unused(self, oldest):
statement = delete(OuthashesV2).where(
OuthashesV2.created < oldest,
~(
select(UnihashesV2.id)
select(UnihashesV3.id)
.where(
UnihashesV2.method == OuthashesV2.method,
UnihashesV2.taskhash == OuthashesV2.taskhash,
UnihashesV3.method == OuthashesV2.method,
UnihashesV3.taskhash == OuthashesV2.taskhash,
)
.limit(1)
.exists()
@@ -287,15 +435,17 @@ class Database(object):
return result.rowcount
async def insert_unihash(self, method, taskhash, unihash):
statement = insert(UnihashesV2).values(
method=method,
taskhash=taskhash,
unihash=unihash,
)
self.logger.debug("%s", statement)
try:
async with self.db.begin():
await self.db.execute(statement)
await self._execute(
insert(UnihashesV3).values(
method=method,
taskhash=taskhash,
unihash=unihash,
gc_mark=self._get_config_subquery("gc-mark", ""),
)
)
return True
except IntegrityError:
self.logger.debug(
@@ -418,7 +568,7 @@ class Database(object):
async def get_query_columns(self):
columns = set()
for table in (UnihashesV2, OuthashesV2):
for table in (UnihashesV3, OuthashesV2):
for c in table.__table__.columns:
if not isinstance(c.type, Text):
continue