Fix memory leak related to cached lookups
* It turned out that Mistral used a lot of memory because it used cached DB lookups for task executions (only task executions in a terminal state get cached) and the maximum size of the cache was too big, 20000 entries. One task execution in certain cases may take a lot of memory (e.g. several megabytes) so 20000 objects make memory footprint huge. Additionally, when a workflow completes we need to invalidate coresponding task executions in the cache. This didn't happen before this patch. * This patch fixes the aforementioned issues by using partial invalidation of the cache and setting smaller cache size. * Fixed "Starting workflow .." log statement to not print the entire structure of the workflow definition into the workflow log, only its name and input parameters * Minor style fixes Change-Id: I0ee300f631a4bdfa2f618c2a10048267f27b3345 Closes-bug: #1664864
This commit is contained in:
@@ -354,7 +354,7 @@ class ActionExecution(resource.Resource):
|
||||
output = types.jsontype
|
||||
created_at = wtypes.text
|
||||
updated_at = wtypes.text
|
||||
params = types.jsontype
|
||||
params = types.jsontype # TODO(rakhmerov): What is this??
|
||||
|
||||
@classmethod
|
||||
def sample(cls):
|
||||
|
@@ -38,7 +38,6 @@ class DefaultEngine(base.Engine):
|
||||
@profiler.trace('engine-start-workflow')
|
||||
def start_workflow(self, wf_identifier, wf_input, description='',
|
||||
**params):
|
||||
|
||||
with db_api.transaction():
|
||||
wf_ex = wf_handler.start_workflow(
|
||||
wf_identifier,
|
||||
|
@@ -81,7 +81,11 @@ class Workflow(object):
|
||||
|
||||
assert not self.wf_ex
|
||||
|
||||
wf_trace.info(self.wf_ex, "Starting workflow: %s" % self.wf_def)
|
||||
wf_trace.info(
|
||||
self.wf_ex,
|
||||
"Starting workflow [name=%s, input=%s]" %
|
||||
(self.wf_def.name, utils.cut(input_dict))
|
||||
)
|
||||
|
||||
# TODO(rakhmerov): This call implicitly changes input_dict! Fix it!
|
||||
# After fix we need to move validation after adding risky fields.
|
||||
@@ -147,7 +151,7 @@ class Workflow(object):
|
||||
# Since some lookup utils functions may use cache for completed tasks
|
||||
# we need to clean caches to make sure that stale objects can't be
|
||||
# retrieved.
|
||||
lookup_utils.clean_caches()
|
||||
lookup_utils.clear_caches()
|
||||
|
||||
wf_service.update_workflow_execution_env(self.wf_ex, env)
|
||||
|
||||
@@ -258,6 +262,11 @@ class Workflow(object):
|
||||
# only if it completed successfully or failed.
|
||||
self.wf_ex.accepted = states.is_completed(state)
|
||||
|
||||
if states.is_completed(state):
|
||||
# No need to keep task executions of this workflow in the
|
||||
# lookup cache anymore.
|
||||
lookup_utils.invalidate_cached_task_executions(self.wf_ex.id)
|
||||
|
||||
if recursive and self.wf_ex.task_execution_id:
|
||||
parent_task_ex = db_api.get_task_execution(
|
||||
self.wf_ex.task_execution_id
|
||||
|
@@ -248,7 +248,7 @@ class DbTestCase(BaseTest):
|
||||
action_manager.sync_db()
|
||||
|
||||
def _clean_db(self):
|
||||
lookup_utils.clean_caches()
|
||||
lookup_utils.clear_caches()
|
||||
|
||||
contexts = [
|
||||
get_context(default=False),
|
||||
|
82
mistral/tests/unit/engine/test_lookup_utils.py
Normal file
82
mistral/tests/unit/engine/test_lookup_utils.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# Copyright 2017 - Nokia Networks.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
|
||||
from mistral.db.v2 import api as db_api
|
||||
from mistral.services import workflows as wf_service
|
||||
from mistral.tests.unit.engine import base
|
||||
from mistral.workflow import lookup_utils
|
||||
from mistral.workflow import states
|
||||
|
||||
# Use the set_default method to set value otherwise in certain test cases
|
||||
# the change in value is not permanent.
|
||||
cfg.CONF.set_default('auth_enable', False, group='pecan')
|
||||
|
||||
|
||||
class LookupUtilsTest(base.EngineTestCase):
|
||||
def test_task_execution_cache_invalidation(self):
|
||||
wf_text = """---
|
||||
version: '2.0'
|
||||
|
||||
wf:
|
||||
tasks:
|
||||
task1:
|
||||
action: std.noop
|
||||
on-success: join_task
|
||||
|
||||
task2:
|
||||
action: std.noop
|
||||
on-success: join_task
|
||||
|
||||
join_task:
|
||||
join: all
|
||||
on-success: task4
|
||||
|
||||
task4:
|
||||
action: std.noop
|
||||
pause-before: true
|
||||
"""
|
||||
|
||||
wf_service.create_workflows(wf_text)
|
||||
|
||||
# Start workflow.
|
||||
wf_ex = self.engine.start_workflow('wf', {})
|
||||
|
||||
self.await_workflow_paused(wf_ex.id)
|
||||
|
||||
with db_api.transaction():
|
||||
# Note: We need to reread execution to access related tasks.
|
||||
wf_ex = db_api.get_workflow_execution(wf_ex.id)
|
||||
|
||||
tasks = wf_ex.task_executions
|
||||
|
||||
self.assertEqual(4, len(tasks))
|
||||
|
||||
self._assert_single_item(tasks, name='task1', state=states.SUCCESS)
|
||||
self._assert_single_item(tasks, name='task2', state=states.SUCCESS)
|
||||
self._assert_single_item(tasks, name='join_task', state=states.SUCCESS)
|
||||
self._assert_single_item(tasks, name='task4', state=states.IDLE)
|
||||
|
||||
# Expecting one cache entry because we know that 'join' operation
|
||||
# uses cached lookups and the workflow is not finished yet.
|
||||
self.assertEqual(1, lookup_utils.get_task_execution_cache_size())
|
||||
|
||||
self.engine.resume_workflow(wf_ex.id)
|
||||
|
||||
self.await_workflow_success(wf_ex.id)
|
||||
|
||||
# Expecting that the cache size is 0 because the workflow has
|
||||
# finished and invalidated corresponding cache entry.
|
||||
self.assertEqual(0, lookup_utils.get_task_execution_cache_size())
|
@@ -35,8 +35,21 @@ import threading
|
||||
from mistral.db.v2 import api as db_api
|
||||
from mistral.workflow import states
|
||||
|
||||
_TASK_EXECUTIONS_CACHE_LOCK = threading.RLock()
|
||||
_TASK_EXECUTIONS_CACHE = cachetools.LRUCache(maxsize=20000)
|
||||
|
||||
def _create_lru_cache_for_workflow_execution(wf_ex_id):
|
||||
return cachetools.LRUCache(maxsize=500)
|
||||
|
||||
# This is a two-level caching structure.
|
||||
# First level: [<workflow execution id> -> <task execution cache>]
|
||||
# Second level (task execution cache): [<task_name> -> <task executions>]
|
||||
# The first level (by workflow execution id) allows to invalidate
|
||||
# needed cache entry when the workflow gets completed.
|
||||
_TASK_EX_CACHE = cachetools.LRUCache(
|
||||
maxsize=100,
|
||||
missing=_create_lru_cache_for_workflow_execution
|
||||
)
|
||||
|
||||
_CACHE_LOCK = threading.RLock()
|
||||
|
||||
|
||||
def find_task_executions_by_name(wf_ex_id, task_name):
|
||||
@@ -46,10 +59,8 @@ def find_task_executions_by_name(wf_ex_id, task_name):
|
||||
:param task_name: Task name.
|
||||
:return: Task executions (possibly a cached value).
|
||||
"""
|
||||
cache_key = (wf_ex_id, task_name)
|
||||
|
||||
with _TASK_EXECUTIONS_CACHE_LOCK:
|
||||
t_execs = _TASK_EXECUTIONS_CACHE.get(cache_key)
|
||||
with _CACHE_LOCK:
|
||||
t_execs = _TASK_EX_CACHE[wf_ex_id].get(task_name)
|
||||
|
||||
if t_execs:
|
||||
return t_execs
|
||||
@@ -66,8 +77,8 @@ def find_task_executions_by_name(wf_ex_id, task_name):
|
||||
)
|
||||
|
||||
if all_finished:
|
||||
with _TASK_EXECUTIONS_CACHE_LOCK:
|
||||
_TASK_EXECUTIONS_CACHE[cache_key] = t_execs
|
||||
with _CACHE_LOCK:
|
||||
_TASK_EX_CACHE[wf_ex_id][task_name] = t_execs
|
||||
|
||||
return t_execs
|
||||
|
||||
@@ -108,6 +119,16 @@ def find_completed_tasks(wf_ex_id):
|
||||
return db_api.get_completed_task_executions(workflow_execution_id=wf_ex_id)
|
||||
|
||||
|
||||
def clean_caches():
|
||||
with _TASK_EXECUTIONS_CACHE_LOCK:
|
||||
_TASK_EXECUTIONS_CACHE.clear()
|
||||
def get_task_execution_cache_size():
|
||||
return len(_TASK_EX_CACHE)
|
||||
|
||||
|
||||
def invalidate_cached_task_executions(wf_ex_id):
|
||||
with _CACHE_LOCK:
|
||||
if wf_ex_id in _TASK_EX_CACHE:
|
||||
del _TASK_EX_CACHE[wf_ex_id]
|
||||
|
||||
|
||||
def clear_caches():
|
||||
with _CACHE_LOCK:
|
||||
_TASK_EX_CACHE.clear()
|
||||
|
Reference in New Issue
Block a user