From f2c75bd0f6834cbb9eec36d0ba37c021ff0f262c Mon Sep 17 00:00:00 2001 From: Griatch Date: Sun, 18 May 2014 18:28:10 +0200 Subject: [PATCH] Added periodic idmapper cache size check along with conditional flush mechanism. Ran a lot of tests and stress tests to get statistics on usage. --- src/commands/default/system.py | 6 +- src/scripts/scripts.py | 22 +++++- src/server/initial_setup.py | 5 +- src/server/server.py | 3 - src/settings_default.py | 29 ++++---- src/utils/dummyrunner/dummyrunner_actions.py | 4 +- src/utils/dummyrunner/memplot.py | 39 +++++++++-- src/utils/idmapper/base.py | 70 +++++++++++++++++++- 8 files changed, 145 insertions(+), 33 deletions(-) diff --git a/src/commands/default/system.py b/src/commands/default/system.py index 75051ef036..e7410bbe81 100644 --- a/src/commands/default/system.py +++ b/src/commands/default/system.py @@ -13,7 +13,7 @@ import twisted from time import time as timemeasure from django.conf import settings -from src.server.caches import get_cache_sizes +#from src.server.caches import get_cache_sizes from src.server.sessionhandler import SESSIONS from src.scripts.models import ScriptDB from src.objects.models import ObjectDB @@ -678,8 +678,8 @@ class CmdServerLoad(MuxCommand): loadavg = os.getloadavg() psize = _resource.getpagesize() pid = os.getpid() - rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1024.0 # resident memory - vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1024.0 # virtual memory + rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1000.0 # resident memory + vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1000.0 # virtual memory pmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "%mem")).read()) # percent of resident memory to total rusage = resource.getrusage(resource.RUSAGE_SELF) diff --git a/src/scripts/scripts.py b/src/scripts/scripts.py index f43d8aa627..289a655a83 100644 --- a/src/scripts/scripts.py +++ b/src/scripts/scripts.py @@ -19,8 +19,6 @@ __all__ = ["Script", "DoNothing", "CheckSessions", _GA = object.__getattribute__ _SESSIONS = None -# attr-cache size in MB -_ATTRIBUTE_CACHE_MAXSIZE = settings.ATTRIBUTE_CACHE_MAXSIZE class ExtendedLoopingCall(LoopingCall): """ @@ -155,7 +153,7 @@ class ScriptBase(TypeClass): def _step_errback(self, e): "callback for runner errors" cname = self.__class__.__name__ - estring = _("Script %(key)s(#%(dbid)i) of type '%(cname)s': at_repeat() error '%(err)s'.") % \ + estring = _("Script %(key)s(#%(dbid)s) of type '%(cname)s': at_repeat() error '%(err)s'.") % \ {"key": self.key, "dbid": self.dbid, "cname": cname, "err": e.getErrorMessage()} try: @@ -538,6 +536,24 @@ class CheckSessions(Script): #print "ValidateSessions run" _SESSIONS.validate_sessions() +_FLUSH_CACHE = None +_IDMAPPER_CACHE_MAX_MEMORY = settings.IDMAPPER_CACHE_MAXSIZE +class ValidateIdmapperCache(Script): + """ + Check memory use of idmapper cache + """ + def at_script_creation(self): + self.key = "sys_cache_validate" + self.desc = _("Restrains size of idmapper cache.") + self.interval = 61 * 5 # staggered compared to session check + self.persistent = True + + def at_repeat(self): + "Called every ~5 mins" + global _FLUSH_CACHE + if not _FLUSH_CACHE: + from src.utils.idmapper.base import conditional_flush as _FLUSH_CACHE + _FLUSH_CACHE(_IDMAPPER_CACHE_MAX_MEMORY) class ValidateScripts(Script): "Check script validation regularly" diff --git a/src/server/initial_setup.py b/src/server/initial_setup.py index da71b0e016..031abf56a8 100644 --- a/src/server/initial_setup.py +++ b/src/server/initial_setup.py @@ -161,7 +161,10 @@ def create_system_scripts(): script2 = create.create_script(scripts.ValidateScripts) # update the channel handler to make sure it's in sync script3 = create.create_script(scripts.ValidateChannelHandler) - if not script1 or not script2 or not script3: + # flush the idmapper cache + script4 = create.create_script(scripts.ValidateIdmapperCache) + + if not script1 or not script2 or not script3 or not script4: print " Error creating system scripts." diff --git a/src/server/server.py b/src/server/server.py index 31982d5786..9a0c7f1ee5 100644 --- a/src/server/server.py +++ b/src/server/server.py @@ -379,9 +379,6 @@ EVENNIA = Evennia(application) print '-' * 50 print ' %(servername)s Server (%(version)s) started.' % {'servername': SERVERNAME, 'version': VERSION} -if not settings.GAME_CACHE_TYPE: - print " caching disabled" - if AMP_ENABLED: # The AMP protocol handles the communication between diff --git a/src/settings_default.py b/src/settings_default.py index ddfef3d32e..870f294209 100644 --- a/src/settings_default.py +++ b/src/settings_default.py @@ -125,18 +125,23 @@ ENCODINGS = ["utf-8", "latin-1", "ISO-8859-1"] AMP_HOST = 'localhost' AMP_PORT = 5000 AMP_INTERFACE = '127.0.0.1' -# Caching speeds up all forms of database access, often considerably. There -# are (currently) only two settings, "local" or None, the latter of which turns -# off all caching completely. Local caching stores data in the process. It's -# very fast but will go out of sync if more than one process writes to the -# database (such as when using procpool or an extensice web precense). -GAME_CACHE_TYPE = "local" -# Attributes on objects are cached aggressively for speed. If the number of -# objects is large (and their attributes are often accessed) this can use up -# a lot of memory. So every now and then Evennia checks the size of this -# cache and resets it if it's too big. This variable sets the maximum -# size (in MB). -ATTRIBUTE_CACHE_MAXSIZE = 100 +# Database objects are cached in what is known as the idmapper. The idmapper +# caching results in a massive speedup of the server (since it dramatically +# limits the number of database accesses needed) and also allows for +# storing temporary data on objects. It is however also the main memory +# consumer of Evennia. With this setting the cache can be capped and +# flushed when it reaches a certain size. +# Empirically, N_objects_in_cache ~ (RMEM - 35) / 0.0157. +# mem(MB) | objs in cache || mem(MB) | objs in cache +# 50 | ~1000 || 800 | ~49 000 +# 100 | ~4000 || 1200 | ~75 000 +# 200 | ~10 000 || 1600 | ~100 000 +# 500 | ~30 000 || 2000 | ~125 000 +# Note that the exact memory usage is not exact, so err on the side of +# caution if running on a server with limited memory. How many objects +# need to be in memory at any given time depends very much on each game +# and how many players are active. Using None disables the cache cap. +IDMAPPER_CACHE_MAXSIZE = 200 ###################################################################### # Evennia Database config diff --git a/src/utils/dummyrunner/dummyrunner_actions.py b/src/utils/dummyrunner/dummyrunner_actions.py index 01adc966a7..0955005248 100644 --- a/src/utils/dummyrunner/dummyrunner_actions.py +++ b/src/utils/dummyrunner/dummyrunner_actions.py @@ -210,6 +210,4 @@ def c_moves(client): ## "heavy digger memory tester" definition ACTIONS = (c_login, c_logout, - (0.1, c_looks), - (0.1, c_creates_obj), - (0.8, c_digs)) + (1.0, c_digs)) diff --git a/src/utils/dummyrunner/memplot.py b/src/utils/dummyrunner/memplot.py index 44a8129aa2..be6af2d2eb 100644 --- a/src/utils/dummyrunner/memplot.py +++ b/src/utils/dummyrunner/memplot.py @@ -49,12 +49,12 @@ if __name__ == "__main__": vmem = data[:,2] nobj = data[:,3] - # count total amount of objects - #ntot = data[:,3].copy() + # calculate derivative of obj creation + oderiv = (0.5*(nobj[2:] - nobj[:-2]) / (secs[2:] - secs[:-2])).copy() fig = pp.figure() ax1 = fig.add_subplot(111) - ax1.set_title("Memory usage") + ax1.set_title("Memory usage (200 bots, auto-flush at RMEM ~ 200MB)") ax1.set_xlabel("Time (mins)") ax1.set_ylabel("Memory usage (MB)") ax1.plot(secs, rmem, "r", label="RMEM", lw=2) @@ -63,9 +63,36 @@ if __name__ == "__main__": ax2 = ax1.twinx() ax2.plot(secs, nobj, "g--", label="objs in cache", lw=2) - #ax2.plot(secs, ntot, "r--", label="objs total", lw=2) + #ax2.plot(secs[:-2], oderiv/60.0, "g--", label="Objs/second", lw=2) ax2.set_ylabel("Number of objects") ax2.legend(loc="lower right") - ax2.annotate("idmapper\nflush", xy=(70,480)) - ax2.annotate("@reload", xy=(185,600)) + #ax2.annotate("All bots\nfinished\nconnecting", xy=(10, 16900)) + #ax2.annotate("idmapper\nflush", xy=(70,480)) + #ax2.annotate("@reload", xy=(185,600)) + +# # plot mem vs cachesize +# nobj, rmem, vmem = nobj[:262].copy(), rmem[:262].copy(), vmem[:262].copy() +# +# fig = pp.figure() +# ax1 = fig.add_subplot(111) +# ax1.set_title("Memory usage per cache size") +# ax1.set_xlabel("Cache size (number of objects)") +# ax1.set_ylabel("Memory usage (MB)") +# ax1.plot(nobj, rmem, "r", label="RMEM", lw=2) +# ax1.plot(nobj, vmem, "b", label="VMEM", lw=2) +# + +## # empirical estimate of memory usage: rmem = 35.0 + 0.0157 * Ncache +## # Ncache = int((rmem - 35.0) / 0.0157) (rmem in MB) +# +# rderiv_aver = 0.0157 +# fig = pp.figure() +# ax1 = fig.add_subplot(111) +# ax1.set_title("Relation between memory and cache size") +# ax1.set_xlabel("Memory usage (MB)") +# ax1.set_ylabel("Idmapper Cache Size (number of objects)") +# rmem = numpy.linspace(35, 2000, 2000) +# nobjs = numpy.array([int((mem - 35.0) / 0.0157) for mem in rmem]) +# ax1.plot(rmem, nobjs, "r", lw=2) + pp.show() diff --git a/src/utils/idmapper/base.py b/src/utils/idmapper/base.py index b27d1a325d..2a163692b8 100755 --- a/src/utils/idmapper/base.py +++ b/src/utils/idmapper/base.py @@ -7,7 +7,7 @@ leave caching unexpectedly (no use of WeakRefs). Also adds cache_size() for monitoring the size of the cache. """ -import os, threading, gc +import os, threading, gc, time #from twisted.internet import reactor #from twisted.internet.threads import blockingCallFromThread from weakref import WeakValueDictionary @@ -15,10 +15,13 @@ from twisted.internet.reactor import callFromThread from django.core.exceptions import ObjectDoesNotExist, FieldError from django.db.models.base import Model, ModelBase from django.db.models.signals import post_save, pre_delete, post_syncdb +from src.utils import logger from src.utils.utils import dbref, get_evennia_pids, to_str from manager import SharedMemoryManager +AUTO_FLUSH_MIN_INTERVAL = 60.0 * 5 # at least 5 mins between cache flushes + _GA = object.__getattribute__ _SA = object.__setattr__ _DA = object.__delattr__ @@ -36,7 +39,6 @@ _SERVER_PID, _PORTAL_PID = get_evennia_pids() _IS_SUBPROCESS = (_SERVER_PID and _PORTAL_PID) and not _SELF_PID in (_SERVER_PID, _PORTAL_PID) _IS_MAIN_THREAD = threading.currentThread().getName() == "MainThread" - class SharedMemoryModelBase(ModelBase): # CL: upstream had a __new__ method that skipped ModelBase's __new__ if # SharedMemoryModelBase was not in the model class's ancestors. It's not @@ -225,6 +227,7 @@ class SharedMemoryModel(Model): Method to store an instance in the cache. """ if instance._get_pk_val() is not None: + cls.__instance_cache__[instance._get_pk_val()] = instance cache_instance = classmethod(cache_instance) @@ -301,6 +304,7 @@ class WeakSharedMemoryModelBase(SharedMemoryModelBase): cls.__instance_cache__ = WeakValueDictionary() cls._idmapper_recache_protection = False + class WeakSharedMemoryModel(SharedMemoryModel): """ Uses a WeakValue dictionary for caching instead of a regular one @@ -309,6 +313,7 @@ class WeakSharedMemoryModel(SharedMemoryModel): class Meta: abstract = True + def flush_cache(**kwargs): """ Flush idmapper cache. When doing so the cache will @@ -346,6 +351,7 @@ def flush_cached_instance(sender, instance, **kwargs): sender.flush_cached_instance(instance) pre_delete.connect(flush_cached_instance) + def update_cached_instance(sender, instance, **kwargs): """ Re-cache the given instance in the idmapper cache @@ -355,6 +361,66 @@ def update_cached_instance(sender, instance, **kwargs): sender.cache_instance(instance) post_save.connect(update_cached_instance) + +LAST_FLUSH = None +def conditional_flush(max_rmem, force=False): + """ + Flush the cache if the estimated memory usage exceeds max_rmem. + + The flusher has a timeout to avoid flushing over and over + in particular situations (this means that for some setups + the memory usage will exceed the requirement and a server with + more memory is probably required for the given game) + + force - forces a flush, regardless of timeout. + """ + global LAST_FLUSH + + def mem2cachesize(desired_rmem): + """ + Estimate the size of the idmapper cache based on the memory + desired. This is used to optionally cap the cache size. + + desired_rmem - memory in MB (minimum 50MB) + + The formula is empirically estimated from usage tests (Linux) + and is + Ncache = RMEM - 35.0 / 0.0157 + where RMEM is given in MB and Ncache is the size of the cache + for this memory usage. VMEM tends to be about 100MB higher + than RMEM for large memory usage. + """ + vmem = max(desired_rmem, 50.0) + Ncache = int(abs(float(vmem) - 35.0) / 0.0157) + return Ncache + + if not max_rmem: + # auto-flush is disabled + return + + now = time.time() + if not LAST_FLUSH: + # server is just starting + LAST_FLUSH = now + return + + if ((now - LAST_FLUSH) < AUTO_FLUSH_MIN_INTERVAL) and not force: + # too soon after last flush. + logger.log_warnmsg("Warning: Idmapper flush called more than "\ + "once in %s min interval. Check memory usage." % (AUTO_FLUSH_MIN_INTERVAL/60.0)) + return + + # check actual memory usage + Ncache_max = mem2cachesize(max_rmem) + Ncache, _ = cache_size() + actual_rmem = float(os.popen('ps -p %d -o %s | tail -1' % (os.getpid(), "rss")).read()) / 1000.0 # resident memory + + if Ncache >= Ncache_max and actual_rmem > max_rmem * 0.9: + # flush cache when number of objects in cache is big enough and our + # actual memory use is within 10% of our set max + flush_cache() + LAST_FLUSH = now + def cache_size(mb=True): """ Calculate statistics about the cache.