Added periodic idmapper cache size check along with conditional flush mechanism. Ran a lot of tests and stress tests to get statistics on usage.

This commit is contained in:
Griatch 2014-05-18 18:28:10 +02:00
parent a143733ccf
commit f2c75bd0f6
8 changed files with 145 additions and 33 deletions

View file

@ -13,7 +13,7 @@ import twisted
from time import time as timemeasure
from django.conf import settings
from src.server.caches import get_cache_sizes
#from src.server.caches import get_cache_sizes
from src.server.sessionhandler import SESSIONS
from src.scripts.models import ScriptDB
from src.objects.models import ObjectDB
@ -678,8 +678,8 @@ class CmdServerLoad(MuxCommand):
loadavg = os.getloadavg()
psize = _resource.getpagesize()
pid = os.getpid()
rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1024.0 # resident memory
vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1024.0 # virtual memory
rmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "rss")).read()) / 1000.0 # resident memory
vmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "vsz")).read()) / 1000.0 # virtual memory
pmem = float(os.popen('ps -p %d -o %s | tail -1' % (pid, "%mem")).read()) # percent of resident memory to total
rusage = resource.getrusage(resource.RUSAGE_SELF)

View file

@ -19,8 +19,6 @@ __all__ = ["Script", "DoNothing", "CheckSessions",
_GA = object.__getattribute__
_SESSIONS = None
# attr-cache size in MB
_ATTRIBUTE_CACHE_MAXSIZE = settings.ATTRIBUTE_CACHE_MAXSIZE
class ExtendedLoopingCall(LoopingCall):
"""
@ -155,7 +153,7 @@ class ScriptBase(TypeClass):
def _step_errback(self, e):
"callback for runner errors"
cname = self.__class__.__name__
estring = _("Script %(key)s(#%(dbid)i) of type '%(cname)s': at_repeat() error '%(err)s'.") % \
estring = _("Script %(key)s(#%(dbid)s) of type '%(cname)s': at_repeat() error '%(err)s'.") % \
{"key": self.key, "dbid": self.dbid, "cname": cname,
"err": e.getErrorMessage()}
try:
@ -538,6 +536,24 @@ class CheckSessions(Script):
#print "ValidateSessions run"
_SESSIONS.validate_sessions()
_FLUSH_CACHE = None
_IDMAPPER_CACHE_MAX_MEMORY = settings.IDMAPPER_CACHE_MAXSIZE
class ValidateIdmapperCache(Script):
"""
Check memory use of idmapper cache
"""
def at_script_creation(self):
self.key = "sys_cache_validate"
self.desc = _("Restrains size of idmapper cache.")
self.interval = 61 * 5 # staggered compared to session check
self.persistent = True
def at_repeat(self):
"Called every ~5 mins"
global _FLUSH_CACHE
if not _FLUSH_CACHE:
from src.utils.idmapper.base import conditional_flush as _FLUSH_CACHE
_FLUSH_CACHE(_IDMAPPER_CACHE_MAX_MEMORY)
class ValidateScripts(Script):
"Check script validation regularly"

View file

@ -161,7 +161,10 @@ def create_system_scripts():
script2 = create.create_script(scripts.ValidateScripts)
# update the channel handler to make sure it's in sync
script3 = create.create_script(scripts.ValidateChannelHandler)
if not script1 or not script2 or not script3:
# flush the idmapper cache
script4 = create.create_script(scripts.ValidateIdmapperCache)
if not script1 or not script2 or not script3 or not script4:
print " Error creating system scripts."

View file

@ -379,9 +379,6 @@ EVENNIA = Evennia(application)
print '-' * 50
print ' %(servername)s Server (%(version)s) started.' % {'servername': SERVERNAME, 'version': VERSION}
if not settings.GAME_CACHE_TYPE:
print " caching disabled"
if AMP_ENABLED:
# The AMP protocol handles the communication between

View file

@ -125,18 +125,23 @@ ENCODINGS = ["utf-8", "latin-1", "ISO-8859-1"]
AMP_HOST = 'localhost'
AMP_PORT = 5000
AMP_INTERFACE = '127.0.0.1'
# Caching speeds up all forms of database access, often considerably. There
# are (currently) only two settings, "local" or None, the latter of which turns
# off all caching completely. Local caching stores data in the process. It's
# very fast but will go out of sync if more than one process writes to the
# database (such as when using procpool or an extensice web precense).
GAME_CACHE_TYPE = "local"
# Attributes on objects are cached aggressively for speed. If the number of
# objects is large (and their attributes are often accessed) this can use up
# a lot of memory. So every now and then Evennia checks the size of this
# cache and resets it if it's too big. This variable sets the maximum
# size (in MB).
ATTRIBUTE_CACHE_MAXSIZE = 100
# Database objects are cached in what is known as the idmapper. The idmapper
# caching results in a massive speedup of the server (since it dramatically
# limits the number of database accesses needed) and also allows for
# storing temporary data on objects. It is however also the main memory
# consumer of Evennia. With this setting the cache can be capped and
# flushed when it reaches a certain size.
# Empirically, N_objects_in_cache ~ (RMEM - 35) / 0.0157.
# mem(MB) | objs in cache || mem(MB) | objs in cache
# 50 | ~1000 || 800 | ~49 000
# 100 | ~4000 || 1200 | ~75 000
# 200 | ~10 000 || 1600 | ~100 000
# 500 | ~30 000 || 2000 | ~125 000
# Note that the exact memory usage is not exact, so err on the side of
# caution if running on a server with limited memory. How many objects
# need to be in memory at any given time depends very much on each game
# and how many players are active. Using None disables the cache cap.
IDMAPPER_CACHE_MAXSIZE = 200
######################################################################
# Evennia Database config

View file

@ -210,6 +210,4 @@ def c_moves(client):
## "heavy digger memory tester" definition
ACTIONS = (c_login,
c_logout,
(0.1, c_looks),
(0.1, c_creates_obj),
(0.8, c_digs))
(1.0, c_digs))

View file

@ -49,12 +49,12 @@ if __name__ == "__main__":
vmem = data[:,2]
nobj = data[:,3]
# count total amount of objects
#ntot = data[:,3].copy()
# calculate derivative of obj creation
oderiv = (0.5*(nobj[2:] - nobj[:-2]) / (secs[2:] - secs[:-2])).copy()
fig = pp.figure()
ax1 = fig.add_subplot(111)
ax1.set_title("Memory usage")
ax1.set_title("Memory usage (200 bots, auto-flush at RMEM ~ 200MB)")
ax1.set_xlabel("Time (mins)")
ax1.set_ylabel("Memory usage (MB)")
ax1.plot(secs, rmem, "r", label="RMEM", lw=2)
@ -63,9 +63,36 @@ if __name__ == "__main__":
ax2 = ax1.twinx()
ax2.plot(secs, nobj, "g--", label="objs in cache", lw=2)
#ax2.plot(secs, ntot, "r--", label="objs total", lw=2)
#ax2.plot(secs[:-2], oderiv/60.0, "g--", label="Objs/second", lw=2)
ax2.set_ylabel("Number of objects")
ax2.legend(loc="lower right")
ax2.annotate("idmapper\nflush", xy=(70,480))
ax2.annotate("@reload", xy=(185,600))
#ax2.annotate("All bots\nfinished\nconnecting", xy=(10, 16900))
#ax2.annotate("idmapper\nflush", xy=(70,480))
#ax2.annotate("@reload", xy=(185,600))
# # plot mem vs cachesize
# nobj, rmem, vmem = nobj[:262].copy(), rmem[:262].copy(), vmem[:262].copy()
#
# fig = pp.figure()
# ax1 = fig.add_subplot(111)
# ax1.set_title("Memory usage per cache size")
# ax1.set_xlabel("Cache size (number of objects)")
# ax1.set_ylabel("Memory usage (MB)")
# ax1.plot(nobj, rmem, "r", label="RMEM", lw=2)
# ax1.plot(nobj, vmem, "b", label="VMEM", lw=2)
#
## # empirical estimate of memory usage: rmem = 35.0 + 0.0157 * Ncache
## # Ncache = int((rmem - 35.0) / 0.0157) (rmem in MB)
#
# rderiv_aver = 0.0157
# fig = pp.figure()
# ax1 = fig.add_subplot(111)
# ax1.set_title("Relation between memory and cache size")
# ax1.set_xlabel("Memory usage (MB)")
# ax1.set_ylabel("Idmapper Cache Size (number of objects)")
# rmem = numpy.linspace(35, 2000, 2000)
# nobjs = numpy.array([int((mem - 35.0) / 0.0157) for mem in rmem])
# ax1.plot(rmem, nobjs, "r", lw=2)
pp.show()

View file

@ -7,7 +7,7 @@ leave caching unexpectedly (no use of WeakRefs).
Also adds cache_size() for monitoring the size of the cache.
"""
import os, threading, gc
import os, threading, gc, time
#from twisted.internet import reactor
#from twisted.internet.threads import blockingCallFromThread
from weakref import WeakValueDictionary
@ -15,10 +15,13 @@ from twisted.internet.reactor import callFromThread
from django.core.exceptions import ObjectDoesNotExist, FieldError
from django.db.models.base import Model, ModelBase
from django.db.models.signals import post_save, pre_delete, post_syncdb
from src.utils import logger
from src.utils.utils import dbref, get_evennia_pids, to_str
from manager import SharedMemoryManager
AUTO_FLUSH_MIN_INTERVAL = 60.0 * 5 # at least 5 mins between cache flushes
_GA = object.__getattribute__
_SA = object.__setattr__
_DA = object.__delattr__
@ -36,7 +39,6 @@ _SERVER_PID, _PORTAL_PID = get_evennia_pids()
_IS_SUBPROCESS = (_SERVER_PID and _PORTAL_PID) and not _SELF_PID in (_SERVER_PID, _PORTAL_PID)
_IS_MAIN_THREAD = threading.currentThread().getName() == "MainThread"
class SharedMemoryModelBase(ModelBase):
# CL: upstream had a __new__ method that skipped ModelBase's __new__ if
# SharedMemoryModelBase was not in the model class's ancestors. It's not
@ -225,6 +227,7 @@ class SharedMemoryModel(Model):
Method to store an instance in the cache.
"""
if instance._get_pk_val() is not None:
cls.__instance_cache__[instance._get_pk_val()] = instance
cache_instance = classmethod(cache_instance)
@ -301,6 +304,7 @@ class WeakSharedMemoryModelBase(SharedMemoryModelBase):
cls.__instance_cache__ = WeakValueDictionary()
cls._idmapper_recache_protection = False
class WeakSharedMemoryModel(SharedMemoryModel):
"""
Uses a WeakValue dictionary for caching instead of a regular one
@ -309,6 +313,7 @@ class WeakSharedMemoryModel(SharedMemoryModel):
class Meta:
abstract = True
def flush_cache(**kwargs):
"""
Flush idmapper cache. When doing so the cache will
@ -346,6 +351,7 @@ def flush_cached_instance(sender, instance, **kwargs):
sender.flush_cached_instance(instance)
pre_delete.connect(flush_cached_instance)
def update_cached_instance(sender, instance, **kwargs):
"""
Re-cache the given instance in the idmapper cache
@ -355,6 +361,66 @@ def update_cached_instance(sender, instance, **kwargs):
sender.cache_instance(instance)
post_save.connect(update_cached_instance)
LAST_FLUSH = None
def conditional_flush(max_rmem, force=False):
"""
Flush the cache if the estimated memory usage exceeds max_rmem.
The flusher has a timeout to avoid flushing over and over
in particular situations (this means that for some setups
the memory usage will exceed the requirement and a server with
more memory is probably required for the given game)
force - forces a flush, regardless of timeout.
"""
global LAST_FLUSH
def mem2cachesize(desired_rmem):
"""
Estimate the size of the idmapper cache based on the memory
desired. This is used to optionally cap the cache size.
desired_rmem - memory in MB (minimum 50MB)
The formula is empirically estimated from usage tests (Linux)
and is
Ncache = RMEM - 35.0 / 0.0157
where RMEM is given in MB and Ncache is the size of the cache
for this memory usage. VMEM tends to be about 100MB higher
than RMEM for large memory usage.
"""
vmem = max(desired_rmem, 50.0)
Ncache = int(abs(float(vmem) - 35.0) / 0.0157)
return Ncache
if not max_rmem:
# auto-flush is disabled
return
now = time.time()
if not LAST_FLUSH:
# server is just starting
LAST_FLUSH = now
return
if ((now - LAST_FLUSH) < AUTO_FLUSH_MIN_INTERVAL) and not force:
# too soon after last flush.
logger.log_warnmsg("Warning: Idmapper flush called more than "\
"once in %s min interval. Check memory usage." % (AUTO_FLUSH_MIN_INTERVAL/60.0))
return
# check actual memory usage
Ncache_max = mem2cachesize(max_rmem)
Ncache, _ = cache_size()
actual_rmem = float(os.popen('ps -p %d -o %s | tail -1' % (os.getpid(), "rss")).read()) / 1000.0 # resident memory
if Ncache >= Ncache_max and actual_rmem > max_rmem * 0.9:
# flush cache when number of objects in cache is big enough and our
# actual memory use is within 10% of our set max
flush_cache()
LAST_FLUSH = now
def cache_size(mb=True):
"""
Calculate statistics about the cache.