MAAS 3.5 compatibility fixes

Change-Id: I77d4f6451a09118846c926743aa3bd9638fdee99
Signed-off-by: Sergiy Markin <smarkin@mirantis.com>
This commit is contained in:
Sergiy Markin
2025-09-23 21:49:14 +00:00
parent 5fdc3d1cea
commit 4f940b38e5
9 changed files with 453 additions and 236 deletions

View File

@@ -384,6 +384,20 @@ conf:
threads: 1
workers: 1
drydock:
networkconfig:
# Controls ApplyNetworkConfig subtask, which applies network
# configuration to nodes after they reach the Ready state.
# ApplyNetworkConfig must succeed before MAAS deployment starts.
# In MAAS 3.5, fabrics may intermittently disappear, causing
# transient errors in this process.
# To work around this, ApplyNetworkConfig is attempted up to
# 'max_retries' times in total, waiting a random delay between
# 'retry_min_delay' and 'retry_max_delay' seconds (inclusive)
# before each new attempt.
# This increases the chance of successful network config
# application despite transient MAAS fabric issues.
max_retries: 10
retry_min_delay: 10
retry_max_delay: 60
logging:
log_level: 'DEBUG'
global_logger_name: 'drydock'

View File

@@ -258,6 +258,16 @@
# used to validate tokens that have restricted access rules. (string value)
#service_type = <None>
# Enable SASL (Simple Authentication and Security Layer) when
# memcache_sasl_enabled is true; otherwise SASL is disabled. (boolean value)
#memcache_sasl_enabled = false
# The user name for SASL authentication (string value)
#memcache_username =
# The password of the SASL user (string value)
#memcache_password =
# Authentication type to load (string value)
# Deprecated group/name - [keystone_authtoken]/auth_plugin
#auth_type = <None>
@@ -344,19 +354,40 @@
#http_client_retries = 3
[networkconfig]
#
# From drydock_provisioner
#
# Maximum number of retries for network configuration (integer value)
#max_retries = 10
# Minimum delay between retries in seconds (integer value)
#retry_min_delay = 10
# Maximum delay between retries in seconds (integer value)
#retry_max_delay = 60
[oslo_policy]
#
# From oslo.policy
#
# This option controls whether or not to enforce scope when evaluating policies.
# If ``True``, the scope of the token used in the request is compared to the
# ``scope_types`` of the policy being enforced. If the scopes do not match, an
# ``InvalidScope`` exception will be raised. If ``False``, a message will be
# logged informing operators that policies are being invoked with mismatching
# scope. (boolean value)
#enforce_scope = false
# DEPRECATED: This option controls whether or not to enforce scope when
# evaluating policies. If ``True``, the scope of the token used in the request
# is compared to the ``scope_types`` of the policy being enforced. If the scopes
# do not match, an ``InvalidScope`` exception will be raised. If ``False``, a
# message will be logged informing operators that policies are being invoked
# with mismatching scope. (boolean value)
# This option is deprecated for removal.
# Its value may be silently ignored in the future.
# Reason: This configuration was added temporarily to facilitate a smooth
# transition to the new RBAC. OpenStack will always enforce scope checks. This
# configuration option is deprecated and will be removed in the 2025.2 cycle.
#enforce_scope = true
# This option controls whether or not to use old deprecated defaults when
# evaluating policies. If ``True``, the old deprecated defaults are not going to
@@ -367,12 +398,12 @@
# deprecated policy check string is logically OR'd with the new policy check
# string, allowing for a graceful upgrade experience between releases with new
# policies, which is the default behavior. (boolean value)
#enforce_new_defaults = false
#enforce_new_defaults = true
# The relative or absolute path of a file that maps roles to permissions for a
# given service. Relative paths must be specified in relation to the
# configuration file setting this option. (string value)
#policy_file = policy.json
#policy_file = policy.yaml
# Default rule. Enforced when a requested rule is not found. (string value)
#policy_default_rule = default
@@ -403,6 +434,10 @@
# Absolute path client key file REST based policy check (string value)
#remote_ssl_client_key_file = <None>
# Timeout in seconds for REST based policy check (floating point value)
# Minimum value: 0
#remote_timeout = 60
[plugins]

View File

@@ -258,6 +258,16 @@
# used to validate tokens that have restricted access rules. (string value)
#service_type = <None>
# Enable SASL (Simple Authentication and Security Layer) when
# memcache_sasl_enabled is true; otherwise SASL is disabled. (boolean value)
#memcache_sasl_enabled = false
# The user name for SASL authentication (string value)
#memcache_username =
# The password of the SASL user (string value)
#memcache_password =
# Authentication type to load (string value)
# Deprecated group/name - [keystone_authtoken]/auth_plugin
#auth_type = <None>
@@ -344,19 +354,40 @@
#http_client_retries = 3
[networkconfig]
#
# From drydock_provisioner
#
# Maximum number of retries for network configuration (integer value)
#max_retries = 10
# Minimum delay between retries in seconds (integer value)
#retry_min_delay = 10
# Maximum delay between retries in seconds (integer value)
#retry_max_delay = 60
[oslo_policy]
#
# From oslo.policy
#
# This option controls whether or not to enforce scope when evaluating policies.
# If ``True``, the scope of the token used in the request is compared to the
# ``scope_types`` of the policy being enforced. If the scopes do not match, an
# ``InvalidScope`` exception will be raised. If ``False``, a message will be
# logged informing operators that policies are being invoked with mismatching
# scope. (boolean value)
#enforce_scope = false
# DEPRECATED: This option controls whether or not to enforce scope when
# evaluating policies. If ``True``, the scope of the token used in the request
# is compared to the ``scope_types`` of the policy being enforced. If the scopes
# do not match, an ``InvalidScope`` exception will be raised. If ``False``, a
# message will be logged informing operators that policies are being invoked
# with mismatching scope. (boolean value)
# This option is deprecated for removal.
# Its value may be silently ignored in the future.
# Reason: This configuration was added temporarily to facilitate a smooth
# transition to the new RBAC. OpenStack will always enforce scope checks. This
# configuration option is deprecated and will be removed in the 2025.2 cycle.
#enforce_scope = true
# This option controls whether or not to use old deprecated defaults when
# evaluating policies. If ``True``, the old deprecated defaults are not going to
@@ -367,12 +398,12 @@
# deprecated policy check string is logically OR'd with the new policy check
# string, allowing for a graceful upgrade experience between releases with new
# policies, which is the default behavior. (boolean value)
#enforce_new_defaults = false
#enforce_new_defaults = true
# The relative or absolute path of a file that maps roles to permissions for a
# given service. Relative paths must be specified in relation to the
# configuration file setting this option. (string value)
#policy_file = policy.json
#policy_file = policy.yaml
# Default rule. Enforced when a requested rule is not found. (string value)
#policy_default_rule = default
@@ -403,6 +434,10 @@
# Absolute path client key file REST based policy check (string value)
#remote_ssl_client_key_file = <None>
# Timeout in seconds for REST based policy check (floating point value)
# Minimum value: 0
#remote_timeout = 60
[plugins]

View File

@@ -123,6 +123,19 @@ class DrydockConfig(object):
default='http://localhost:9000/api/v1.0/bootactions/')
]
# Options for the apply network configuration framework
networkconfig_options = [
cfg.IntOpt('max_retries',
default=10,
help='Maximum number of retries for network configuration'),
cfg.IntOpt('retry_min_delay',
default=10,
help='Minimum delay between retries in seconds'),
cfg.IntOpt('retry_max_delay',
default=60,
help='Maximum delay between retries in seconds')
]
# Options for network traffic
network_options = [
cfg.IntOpt(
@@ -228,6 +241,8 @@ class DrydockConfig(object):
def register_options(self, enable_keystone=True):
self.conf.register_opts(DrydockConfig.options)
self.conf.register_opts(DrydockConfig.networkconfig_options,
group='networkconfig')
self.conf.register_opts(DrydockConfig.bootactions_options,
group='bootactions')
self.conf.register_opts(DrydockConfig.logging_options, group='logging')
@@ -254,6 +269,7 @@ def list_opts():
'timeouts': DrydockConfig.timeout_options,
'database': DrydockConfig.database_options,
'network': DrydockConfig.network_options,
'networkconfig': DrydockConfig.networkconfig_options,
}
package_path = os.path.dirname(os.path.abspath(__file__))

View File

@@ -14,6 +14,7 @@
"""Task driver for completing node provisioning with Canonical MaaS 2.2+."""
import time
import secrets # Use secrets for cryptographic random numbers
import logging
import re
import math
@@ -45,7 +46,6 @@ import drydock_provisioner.drivers.node.maasdriver.models.volumegroup as maas_vg
import drydock_provisioner.drivers.node.maasdriver.models.repository as maas_repo
import drydock_provisioner.drivers.node.maasdriver.models.domain as maas_domain
class BaseMaasAction(BaseAction):
def __init__(self, *args, maas_client=None):
@@ -1375,6 +1375,9 @@ class ApplyNodeNetworking(BaseMaasAction):
"""Action to configure networking on a node."""
def start(self):
MAX_RETRIES = config.config_mgr.conf.networkconfig.max_retries
RETRY_MIN_DELAY = config.config_mgr.conf.networkconfig.retry_min_delay
RETRY_MAX_DELAY = config.config_mgr.conf.networkconfig.retry_max_delay
try:
maas_machine.Machines(self.maas_client).empty_refresh()
@@ -1461,6 +1464,12 @@ class ApplyNodeNetworking(BaseMaasAction):
ctx_type='node')
if machine.status_name == 'Ready':
for attempt in range(MAX_RETRIES):
try:
self.logger.debug(
"Attempting to configure networking for node %s (Attempt %d)" % (
n.name, attempt + 1))
msg = "Located node %s in MaaS, starting interface configuration" % (
n.name)
self.logger.debug(msg)
@@ -1469,9 +1478,13 @@ class ApplyNodeNetworking(BaseMaasAction):
ctx=n.name,
ctx_type='node')
self.logger.debug("Reset network config for node %s." % (n.name))
machine.reset_network_config()
self.logger.debug("Refreshing machine %s." % (n.name))
machine.refresh()
self.logger.debug("Refreshing fabric and subnets list.")
fabrics.refresh()
subnets.refresh()
for i in n.interfaces:
if not i.network_link:
self.logger.debug(
@@ -1488,6 +1501,8 @@ class ApplyNodeNetworking(BaseMaasAction):
% (i.device_name, nl.name))
continue
self.logger.debug("Refreshing fabric list.")
fabrics.refresh()
fabric = fabrics.singleton({'name': nl.name})
if fabric is None:
@@ -1506,15 +1521,53 @@ class ApplyNodeNetworking(BaseMaasAction):
msg = "Building node %s interface %s as a bond." % (
n.name, i.device_name)
self.logger.debug(msg)
self.logger.debug(
"Adding status message for node %s: %s" % (n.name, msg))
self.task.add_status_msg(msg=msg,
error=False,
ctx=n.name,
ctx_type='node')
self.logger.debug(
"Fetching hardware slaves for interface %s on node %s." % (
i.device_name, n.name))
hw_iface_list = i.get_hw_slaves()
self.logger.debug("Hardware slaves fetched: %s" % hw_iface_list)
hw_iface_logicalname_list = []
for hw_iface in hw_iface_list:
hw_iface_logicalname_list.append(
n.get_logicalname(hw_iface))
logical_name = n.get_logicalname(hw_iface)
self.logger.debug(
"Mapping hardware slave %s to logical name %s." % (
hw_iface, logical_name))
hw_iface_logicalname_list.append(logical_name)
msg = """Creating bond interface with parameters:
device_name=%s,
parent_names=%s,
mtu=%s,
fabric=%s,
mode=%s,
monitor_interval=%s,
downdelay=%s,
updelay=%s,
lacp_rate=%s,
hash_policy=%s""" % (
i.device_name,
hw_iface_logicalname_list,
nl.mtu,
fabric.resource_id,
nl.bonding_mode,
nl.bonding_mon_rate,
nl.bonding_down_delay,
nl.bonding_up_delay,
nl.bonding_peer_rate,
nl.bonding_xmit_hash)
self.logger.debug(msg)
self.task.add_status_msg(msg=msg,
error=False,
ctx=n.name,
ctx_type='node')
iface = machine.interfaces.create_bond(
device_name=i.device_name,
parent_names=hw_iface_logicalname_list,
@@ -1539,7 +1592,8 @@ class ApplyNodeNetworking(BaseMaasAction):
continue
else:
if len(i.get_hw_slaves()) > 1:
msg = "Network link %s disables bonding, interface %s has multiple slaves." % \
msg = """Network link %s disables bonding,
interface %s has multiple slaves.""" % \
(nl.name, i.device_name)
self.logger.warning(msg)
self.task.add_status_msg(msg=msg,
@@ -1586,12 +1640,22 @@ class ApplyNodeNetworking(BaseMaasAction):
"Interface %s already attached to fabric_id %s"
% (i.device_name, fabric.resource_id))
else:
# Refresh the list of fabrics
self.logger.debug("Refreshing fabric list before attaching fabric.")
fabrics.refresh()
# Validate the fabric_id
if not fabrics.contains(fabric.resource_id):
msg = "Fabric ID %s does not exist "
"in the refreshed list of fabrics." % fabric.resource_id
self.logger.error(msg)
raise errors.DriverError(msg)
# Proceed with attaching the fabric
self.logger.debug(
"Attaching node %s interface %s to fabric_id %s"
% (n.name, i.device_name,
fabric.resource_id))
iface.attach_fabric(
fabric_id=fabric.resource_id)
"Attaching node %s interface %s to fabric_id %s" % (
n.name, i.device_name, fabric.resource_id))
iface.attach_fabric(fabric_id=fabric.resource_id)
if iface.effective_mtu != nl.mtu:
self.logger.debug(
@@ -1610,8 +1674,9 @@ class ApplyNodeNetworking(BaseMaasAction):
# then the interface itself should be linked to network, not a VLAN
# tagged interface
self.logger.debug(
"Attaching node %s interface %s to untagged VLAN on fabric %s"
% (n.name, i.device_name,
"Attaching node %s interface %s to untagged VLAN on fabric %s" % (
n.name,
i.device_name,
fabric.resource_id))
link_iface = iface
else:
@@ -1626,8 +1691,10 @@ class ApplyNodeNetworking(BaseMaasAction):
vlan_options['mtu'] = dd_net.mtu
self.logger.debug(
"Creating tagged interface for VLAN %s on system %s interface %s"
% (dd_net.vlan_id, n.name,
"""Creating tagged interface for
VLAN %s on system %s interface %s""" % (
dd_net.vlan_id,
n.name,
i.device_name))
try:
@@ -1660,7 +1727,8 @@ class ApplyNodeNetworking(BaseMaasAction):
found = True
if not found:
msg = "No addressed assigned to network %s for node %s, link is L2 only." % (
msg = """No addressed assigned to network %s
for node %s, link is L2 only.""" % (
iface_net, n.name)
self.logger.info(msg)
self.task.add_status_msg(
@@ -1682,12 +1750,25 @@ class ApplyNodeNetworking(BaseMaasAction):
self.task.success(focus=n.name)
else:
self.task.failure(focus=n.name)
msg = "Did not find a defined Network %s to attach to interface" % iface_net
msg = """Did not find a defined
Network %s to attach to interface""" % iface_net
self.logger.error(msg)
self.task.add_status_msg(msg=msg,
error=True,
ctx=n.name,
ctx_type='node')
break # Exit retry loop if successful
except Exception as ex:
self.logger.warning(
"Attempt %d failed for node %s: %s" % (attempt + 1, n.name, str(ex)))
if attempt < MAX_RETRIES - 1:
# Wait a random number of seconds between RETRY_MIN_DELAY and RETRY_MAX_DELAY (inclusive)
random_delay = RETRY_MIN_DELAY + \
secrets.randbelow(
RETRY_MAX_DELAY - RETRY_MIN_DELAY + 1)
self.logger.debug(
f"Waiting for {random_delay} seconds before retrying...")
time.sleep(random_delay)
elif machine.status_name == 'Broken':
msg = (
"Located node %s in MaaS, status broken. Run "
@@ -1727,7 +1808,7 @@ class ApplyNodeNetworking(BaseMaasAction):
error=True,
ctx=n.name,
ctx_type='node')
self.task.failure()
self.task.failure(focus=n.name)
except Exception as ex:
msg = "Error configuring network for node %s: %s" % (n.name,
str(ex))

View File

@@ -157,9 +157,17 @@ class ResourceBase(object):
@classmethod
def from_dict(cls, api_client, obj_dict):
"""
Create an instance of this resource class based on a dict
of MaaS type attributes.
"""
refined_dict = {k: obj_dict.get(k, None) for k in cls.fields}
if 'id' in obj_dict.keys():
refined_dict['resource_id'] = obj_dict.get('id')
refined_dict['resource_id'] = obj_dict.get('id') # Map 'id' to 'resource_id'
# Ensure 'resource_id' is set correctly
if refined_dict.get('resource_id') is None:
raise ValueError("Missing 'resource_id' in object dictionary.")
i = cls(api_client, **refined_dict)
return i

View File

@@ -25,9 +25,13 @@ class Fabric(model_base.ResourceBase):
def __init__(self, api_client, **kwargs):
super(Fabric, self).__init__(api_client, **kwargs)
self.logger.debug(f"Initializing Fabric with resource_id: {self.resource_id}")
if getattr(self, 'resource_id', None):
# Explicitly check if resource_id is not None
if getattr(self, 'resource_id', None) is not None:
self.refresh_vlans()
else:
self.logger.warning("Cannot refresh VLANs for Fabric without resource_id.")
def refresh(self):
super(Fabric, self).refresh()

View File

@@ -60,14 +60,21 @@ class Interface(model_base.ResourceBase):
:param fabric_id: The MaaS resource ID of a network Fabric to connect to
:param fabric_name: The name of a MaaS fabric to connect to
"""
self.logger.debug(
"Starting attach_fabric with parameters: fabric_id=%s, fabric_name=%s" % (
fabric_id, fabric_name))
fabric = None
fabrics = maas_fabric.Fabrics(self.api_client)
self.logger.debug("Refreshing fabric list.")
fabrics.refresh()
if fabric_id is not None:
self.logger.debug("Looking for fabric with ID: %s" % fabric_id)
fabric = fabrics.select(fabric_id)
elif fabric_name is not None:
self.logger.debug("Looking for fabric with name: %s" % fabric_name)
fabric = fabrics.singleton({'name': fabric_name})
else:
self.logger.warning("Must specify fabric_id or fabric_name")
@@ -81,21 +88,38 @@ class Interface(model_base.ResourceBase):
"Fabric not found in MaaS for fabric_id %s, fabric_name %s" %
(fabric_id, fabric_name))
self.logger.debug("Found fabric: %s" % fabric.resource_id)
self.logger.debug("Refreshing VLAN list for fabric %s." % fabric.resource_id)
if hasattr(fabric, 'vlans'):
fabric.vlans.refresh()
else:
self.logger.error(
"Fabric object has no attribute 'vlans'. Type: %s, value: %s" % (
type(fabric), str(fabric)))
raise errors.DriverError("Fabric object has no attribute 'vlans'")
# Locate the untagged VLAN for this fabric.
self.logger.debug("Looking for untagged VLAN (vid=0) on fabric %s" % fabric.resource_id)
fabric_vlan = fabric.vlans.singleton({'vid': 0})
if fabric_vlan is None:
self.logger.warning("Cannot locate untagged VLAN on fabric %s" %
(fabric_id))
(fabric.resource_id))
raise errors.DriverError(
"Cannot locate untagged VLAN on fabric %s" % (fabric_id))
"Cannot locate untagged VLAN on fabric %s" % (fabric.resource_id))
self.logger.debug("Found untagged VLAN: %s" % fabric_vlan.resource_id)
self.vlan = fabric_vlan.resource_id
self.logger.info(
"Attaching interface %s on system %s to VLAN %s on fabric %s" %
(self.resource_id, self.system_id, fabric_vlan.resource_id,
fabric.resource_id))
self.logger.debug("Updating interface with new VLAN configuration.")
self.update()
self.logger.debug("Interface %s successfully attached to fabric %s." % (self.resource_id, fabric.resource_id))
def is_linked(self, subnet_id):
"""Check if this interface is linked to the given subnet.

View File

@@ -47,7 +47,7 @@ class Machine(model_base.ResourceBase):
super(Machine, self).__init__(api_client, **kwargs)
# Replace generic dicts with interface collection model
if getattr(self, 'resource_id', None):
if getattr(self, 'resource_id', None) is not None:
self.interfaces = maas_interface.Interfaces(
api_client, system_id=self.resource_id)
self.interfaces.refresh()