From f2c75bd0f6834cbb9eec36d0ba37c021ff0f262c Mon Sep 17 00:00:00 2001
From: Griatch <griatch@gmail.com>
Date: Sun, 18 May 2014 18:28:10 +0200
Subject: [PATCH] Added periodic idmapper cache size check along with
 conditional flush mechanism. Ran a lot of tests and stress tests to get
 statistics on usage.

---
 src/commands/default/system.py               |  6 +-
 src/scripts/scripts.py                       | 22 +++++-
 src/server/initial_setup.py                  |  5 +-
 src/server/server.py                         |  3 -
 src/settings_default.py                      | 29 ++++----
 src/utils/dummyrunner/dummyrunner_actions.py |  4 +-
 src/utils/dummyrunner/memplot.py             | 39 +++++++++--
 src/utils/idmapper/base.py                   | 70 +++++++++++++++++++-
 8 files changed, 145 insertions(+), 33 deletions(-)

diff --git a/src/commands/default/system.py b/src/commands/default/system.py
index 75051ef036..e7410bbe81 100644
--- a/src/commands/default/system.py
+++ b/src/commands/default/system.py
@@ -13,7 +13,7 @@ import twisted
 from time import time as timemeasure
 
 from django.conf import settings
-from src.server.caches import get_cache_sizes
+#from src.server.caches import get_cache_sizes
 from src.server.sessionhandler import SESSIONS
 from src.scripts.models import ScriptDB
 from src.objects.models import ObjectDB
@@ -678,8 +678,8 @@ class CmdServerLoad(MuxCommand):
         loadavg = os.getloadavg()
         psize = _resource.getpagesize()
         pid = os.getpid()
-        rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1024.0  # resident memory
-        vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1024.0  # virtual memory
+        rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1000.0  # resident memory
+        vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1000.0  # virtual memory
         pmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "%mem")).read())  # percent of resident memory to total
         rusage = resource.getrusage(resource.RUSAGE_SELF)
 
diff --git a/src/scripts/scripts.py b/src/scripts/scripts.py
index f43d8aa627..289a655a83 100644
--- a/src/scripts/scripts.py
+++ b/src/scripts/scripts.py
@@ -19,8 +19,6 @@ __all__ = ["Script", "DoNothing", "CheckSessions",
 
 _GA = object.__getattribute__
 _SESSIONS = None
-# attr-cache size in MB
-_ATTRIBUTE_CACHE_MAXSIZE = settings.ATTRIBUTE_CACHE_MAXSIZE
 
 class ExtendedLoopingCall(LoopingCall):
     """
@@ -155,7 +153,7 @@ class ScriptBase(TypeClass):
     def _step_errback(self, e):
         "callback for runner errors"
         cname = self.__class__.__name__
-        estring = _("Script %(key)s(#%(dbid)i) of type '%(cname)s': at_repeat() error '%(err)s'.") % \
+        estring = _("Script %(key)s(#%(dbid)s) of type '%(cname)s': at_repeat() error '%(err)s'.") % \
                           {"key": self.key, "dbid": self.dbid, "cname": cname,
                            "err": e.getErrorMessage()}
         try:
@@ -538,6 +536,24 @@ class CheckSessions(Script):
         #print "ValidateSessions run"
         _SESSIONS.validate_sessions()
 
+_FLUSH_CACHE = None
+_IDMAPPER_CACHE_MAX_MEMORY = settings.IDMAPPER_CACHE_MAXSIZE
+class ValidateIdmapperCache(Script):
+    """
+    Check memory use of idmapper cache
+    """
+    def at_script_creation(self):
+        self.key = "sys_cache_validate"
+        self.desc = _("Restrains size of idmapper cache.")
+        self.interval = 61 * 5 # staggered compared to session check
+        self.persistent = True
+
+    def at_repeat(self):
+        "Called every ~5 mins"
+        global _FLUSH_CACHE
+        if not _FLUSH_CACHE:
+            from src.utils.idmapper.base import conditional_flush as _FLUSH_CACHE
+        _FLUSH_CACHE(_IDMAPPER_CACHE_MAX_MEMORY)
 
 class ValidateScripts(Script):
     "Check script validation regularly"
diff --git a/src/server/initial_setup.py b/src/server/initial_setup.py
index da71b0e016..031abf56a8 100644
--- a/src/server/initial_setup.py
+++ b/src/server/initial_setup.py
@@ -161,7 +161,10 @@ def create_system_scripts():
     script2 = create.create_script(scripts.ValidateScripts)
     # update the channel handler to make sure it's in sync
     script3 = create.create_script(scripts.ValidateChannelHandler)
-    if not script1 or not script2 or not script3:
+    # flush the idmapper cache
+    script4 = create.create_script(scripts.ValidateIdmapperCache)
+
+    if not script1 or not script2 or not script3 or not script4:
         print " Error creating system scripts."
 
 
diff --git a/src/server/server.py b/src/server/server.py
index 31982d5786..9a0c7f1ee5 100644
--- a/src/server/server.py
+++ b/src/server/server.py
@@ -379,9 +379,6 @@ EVENNIA = Evennia(application)
 print '-' * 50
 print ' %(servername)s Server (%(version)s) started.' % {'servername': SERVERNAME, 'version': VERSION}
 
-if not settings.GAME_CACHE_TYPE:
-    print "  caching disabled"
-
 if AMP_ENABLED:
 
     # The AMP protocol handles the communication between
diff --git a/src/settings_default.py b/src/settings_default.py
index ddfef3d32e..870f294209 100644
--- a/src/settings_default.py
+++ b/src/settings_default.py
@@ -125,18 +125,23 @@ ENCODINGS = ["utf-8", "latin-1", "ISO-8859-1"]
 AMP_HOST = 'localhost'
 AMP_PORT = 5000
 AMP_INTERFACE = '127.0.0.1'
-# Caching speeds up all forms of database access, often considerably. There
-# are (currently) only two settings, "local" or None, the latter of which turns
-# off all caching completely. Local caching stores data in the process. It's
-# very fast but will go out of sync if more than one process writes to the
-# database (such as when using procpool or an extensice web precense).
-GAME_CACHE_TYPE = "local"
-# Attributes on objects are cached aggressively for speed. If the number of
-# objects is large (and their attributes are often accessed) this can use up
-# a lot of memory. So every now and then Evennia checks the size of this
-# cache and resets it if it's too big. This variable sets the maximum
-# size (in MB).
-ATTRIBUTE_CACHE_MAXSIZE = 100
+# Database objects are cached in what is known as the idmapper. The idmapper
+# caching results in a massive speedup of the server (since it dramatically
+# limits the number of database accesses needed) and also allows for
+# storing temporary data on objects. It is however also the main memory
+# consumer of Evennia. With this setting the cache can be capped and
+# flushed when it reaches a certain size.
+# Empirically, N_objects_in_cache ~ (RMEM - 35) / 0.0157.
+#  mem(MB)   |  objs in cache   ||   mem(MB)   |   objs in cache
+#      50    |       ~1000      ||      800    |     ~49 000
+#     100    |       ~4000      ||     1200    |     ~75 000
+#     200    |      ~10 000     ||     1600    |    ~100 000
+#     500    |      ~30 000     ||     2000    |    ~125 000
+# Note that the exact memory usage is not exact, so err on the side of
+# caution if running on a server with limited memory.  How many objects
+# need to be in memory at any given time depends very much on each game
+# and how many players are active. Using None disables the cache cap.
+IDMAPPER_CACHE_MAXSIZE = 200
 
 ######################################################################
 # Evennia Database config
diff --git a/src/utils/dummyrunner/dummyrunner_actions.py b/src/utils/dummyrunner/dummyrunner_actions.py
index 01adc966a7..0955005248 100644
--- a/src/utils/dummyrunner/dummyrunner_actions.py
+++ b/src/utils/dummyrunner/dummyrunner_actions.py
@@ -210,6 +210,4 @@ def c_moves(client):
 ## "heavy digger memory tester" definition
 ACTIONS = (c_login,
            c_logout,
-           (0.1, c_looks),
-           (0.1, c_creates_obj),
-           (0.8, c_digs))
+           (1.0, c_digs))
diff --git a/src/utils/dummyrunner/memplot.py b/src/utils/dummyrunner/memplot.py
index 44a8129aa2..be6af2d2eb 100644
--- a/src/utils/dummyrunner/memplot.py
+++ b/src/utils/dummyrunner/memplot.py
@@ -49,12 +49,12 @@ if __name__ == "__main__":
     vmem = data[:,2]
     nobj = data[:,3]
 
-    # count total amount of objects
-    #ntot = data[:,3].copy()
+    # calculate derivative of obj creation
+    oderiv = (0.5*(nobj[2:] - nobj[:-2]) / (secs[2:] - secs[:-2])).copy()
 
     fig = pp.figure()
     ax1 = fig.add_subplot(111)
-    ax1.set_title("Memory usage")
+    ax1.set_title("Memory usage (200 bots, auto-flush at RMEM ~ 200MB)")
     ax1.set_xlabel("Time (mins)")
     ax1.set_ylabel("Memory usage (MB)")
     ax1.plot(secs, rmem, "r", label="RMEM", lw=2)
@@ -63,9 +63,36 @@ if __name__ == "__main__":
 
     ax2 = ax1.twinx()
     ax2.plot(secs, nobj, "g--", label="objs in cache", lw=2)
-    #ax2.plot(secs, ntot, "r--", label="objs total", lw=2)
+    #ax2.plot(secs[:-2], oderiv/60.0, "g--", label="Objs/second", lw=2)
     ax2.set_ylabel("Number of objects")
     ax2.legend(loc="lower right")
-    ax2.annotate("idmapper\nflush", xy=(70,480))
-    ax2.annotate("@reload", xy=(185,600))
+    #ax2.annotate("All bots\nfinished\nconnecting", xy=(10, 16900))
+    #ax2.annotate("idmapper\nflush", xy=(70,480))
+    #ax2.annotate("@reload", xy=(185,600))
+
+#    # plot mem vs cachesize
+#    nobj, rmem, vmem = nobj[:262].copy(), rmem[:262].copy(), vmem[:262].copy()
+#
+#    fig = pp.figure()
+#    ax1 = fig.add_subplot(111)
+#    ax1.set_title("Memory usage per cache size")
+#    ax1.set_xlabel("Cache size (number of objects)")
+#    ax1.set_ylabel("Memory usage (MB)")
+#    ax1.plot(nobj, rmem, "r", label="RMEM", lw=2)
+#    ax1.plot(nobj, vmem, "b", label="VMEM", lw=2)
+#
+
+##    # empirical estimate of memory usage: rmem = 35.0 + 0.0157 * Ncache
+##    # Ncache = int((rmem - 35.0) / 0.0157)  (rmem in MB)
+#
+#    rderiv_aver = 0.0157
+#    fig = pp.figure()
+#    ax1 = fig.add_subplot(111)
+#    ax1.set_title("Relation between memory and cache size")
+#    ax1.set_xlabel("Memory usage (MB)")
+#    ax1.set_ylabel("Idmapper Cache Size (number of objects)")
+#    rmem = numpy.linspace(35, 2000, 2000)
+#    nobjs = numpy.array([int((mem - 35.0) / 0.0157) for mem in rmem])
+#    ax1.plot(rmem, nobjs, "r", lw=2)
+
     pp.show()
diff --git a/src/utils/idmapper/base.py b/src/utils/idmapper/base.py
index b27d1a325d..2a163692b8 100755
--- a/src/utils/idmapper/base.py
+++ b/src/utils/idmapper/base.py
@@ -7,7 +7,7 @@ leave caching unexpectedly (no use of WeakRefs).
 Also adds cache_size() for monitoring the size of the cache.
 """
 
-import os, threading, gc
+import os, threading, gc, time
 #from twisted.internet import reactor
 #from twisted.internet.threads import blockingCallFromThread
 from weakref import WeakValueDictionary
@@ -15,10 +15,13 @@ from twisted.internet.reactor import callFromThread
 from django.core.exceptions import ObjectDoesNotExist, FieldError
 from django.db.models.base import Model, ModelBase
 from django.db.models.signals import post_save, pre_delete, post_syncdb
+from src.utils import logger
 from src.utils.utils import dbref, get_evennia_pids, to_str
 
 from manager import SharedMemoryManager
 
+AUTO_FLUSH_MIN_INTERVAL = 60.0 * 5 # at least 5 mins between cache flushes
+
 _GA = object.__getattribute__
 _SA = object.__setattr__
 _DA = object.__delattr__
@@ -36,7 +39,6 @@ _SERVER_PID, _PORTAL_PID = get_evennia_pids()
 _IS_SUBPROCESS = (_SERVER_PID and _PORTAL_PID) and not _SELF_PID in (_SERVER_PID, _PORTAL_PID)
 _IS_MAIN_THREAD = threading.currentThread().getName() == "MainThread"
 
-
 class SharedMemoryModelBase(ModelBase):
     # CL: upstream had a __new__ method that skipped ModelBase's __new__ if
     # SharedMemoryModelBase was not in the model class's ancestors. It's not
@@ -225,6 +227,7 @@ class SharedMemoryModel(Model):
         Method to store an instance in the cache.
         """
         if instance._get_pk_val() is not None:
+
             cls.__instance_cache__[instance._get_pk_val()] = instance
     cache_instance = classmethod(cache_instance)
 
@@ -301,6 +304,7 @@ class WeakSharedMemoryModelBase(SharedMemoryModelBase):
         cls.__instance_cache__ = WeakValueDictionary()
         cls._idmapper_recache_protection = False
 
+
 class WeakSharedMemoryModel(SharedMemoryModel):
     """
     Uses a WeakValue dictionary for caching instead of a regular one
@@ -309,6 +313,7 @@ class WeakSharedMemoryModel(SharedMemoryModel):
     class Meta:
         abstract = True
 
+
 def flush_cache(**kwargs):
     """
     Flush idmapper cache. When doing so the cache will
@@ -346,6 +351,7 @@ def flush_cached_instance(sender, instance, **kwargs):
     sender.flush_cached_instance(instance)
 pre_delete.connect(flush_cached_instance)
 
+
 def update_cached_instance(sender, instance, **kwargs):
     """
     Re-cache the given instance in the idmapper cache
@@ -355,6 +361,66 @@ def update_cached_instance(sender, instance, **kwargs):
     sender.cache_instance(instance)
 post_save.connect(update_cached_instance)
 
+
+LAST_FLUSH = None
+def conditional_flush(max_rmem, force=False):
+    """
+    Flush the cache if the estimated memory usage exceeds max_rmem.
+
+    The flusher has a timeout to avoid flushing over and over
+    in particular situations (this means that for some setups
+    the memory usage will exceed the requirement and a server with
+    more memory is probably required for the given game)
+
+    force - forces a flush, regardless of timeout.
+    """
+    global LAST_FLUSH
+
+    def mem2cachesize(desired_rmem):
+        """
+        Estimate the size of the idmapper cache based on the memory
+        desired. This is used to optionally cap the cache size.
+
+        desired_rmem - memory in MB (minimum 50MB)
+
+        The formula is empirically estimated from usage tests (Linux)
+        and is
+            Ncache = RMEM - 35.0 / 0.0157
+        where RMEM is given in MB and Ncache is the size of the cache
+        for this memory usage. VMEM tends to be about 100MB higher
+        than RMEM for large memory usage.
+        """
+        vmem = max(desired_rmem, 50.0)
+        Ncache = int(abs(float(vmem) - 35.0) / 0.0157)
+        return Ncache
+
+    if not max_rmem:
+        # auto-flush is disabled
+        return
+
+    now = time.time()
+    if not LAST_FLUSH:
+        # server is just starting
+        LAST_FLUSH = now
+        return
+
+    if ((now - LAST_FLUSH) < AUTO_FLUSH_MIN_INTERVAL) and not force:
+        # too soon after last flush.
+        logger.log_warnmsg("Warning: Idmapper flush called more than "\
+                            "once in %s min interval. Check memory usage." % (AUTO_FLUSH_MIN_INTERVAL/60.0))
+        return
+
+    # check actual memory usage
+    Ncache_max = mem2cachesize(max_rmem)
+    Ncache, _ = cache_size()
+    actual_rmem = float(os.popen('ps -p %d -o %s | tail -1' % (os.getpid(), "rss")).read()) / 1000.0  # resident memory
+
+    if Ncache >= Ncache_max and actual_rmem > max_rmem * 0.9:
+        # flush cache when number of objects in cache is big enough and our
+        # actual memory use is within 10% of our set max
+        flush_cache()
+        LAST_FLUSH = now
+
 def cache_size(mb=True):
     """
     Calculate statistics about the cache.