From 4f940b38e5bf1373c3bb2a1041268fa05f996747 Mon Sep 17 00:00:00 2001 From: Sergiy Markin Date: Tue, 23 Sep 2025 21:49:14 +0000 Subject: [PATCH] MAAS 3.5 compatibility fixes Change-Id: I77d4f6451a09118846c926743aa3bd9638fdee99 Signed-off-by: Sergiy Markin --- charts/drydock/values.yaml | 14 + doc/source/_static/drydock.conf.sample | 53 +- etc/drydock/drydock.conf.sample | 53 +- python/drydock_provisioner/config.py | 16 + .../drivers/node/maasdriver/actions/node.py | 507 ++++++++++-------- .../drivers/node/maasdriver/models/base.py | 10 +- .../drivers/node/maasdriver/models/fabric.py | 6 +- .../node/maasdriver/models/interface.py | 28 +- .../drivers/node/maasdriver/models/machine.py | 2 +- 9 files changed, 453 insertions(+), 236 deletions(-) diff --git a/charts/drydock/values.yaml b/charts/drydock/values.yaml index 64e29d56..939c5cfb 100644 --- a/charts/drydock/values.yaml +++ b/charts/drydock/values.yaml @@ -384,6 +384,20 @@ conf: threads: 1 workers: 1 drydock: + networkconfig: + # Controls ApplyNetworkConfig subtask, which applies network + # configuration to nodes after they reach the Ready state. + # ApplyNetworkConfig must succeed before MAAS deployment starts. + # In MAAS 3.5, fabrics may intermittently disappear, causing + # minor discrepancies in this process. + # To work around this, ApplyNetworkConfig retries up to + # 'max_retries' times, with random delays between 'retry_min_delay' + # and 'retry_max_delay' seconds. + # This increases the chance of successful network config + # application despite transient MAAS fabric issues. + max_retries: 10 + retry_min_delay: 10 + retry_max_delay: 60 logging: log_level: 'DEBUG' global_logger_name: 'drydock' diff --git a/doc/source/_static/drydock.conf.sample b/doc/source/_static/drydock.conf.sample index 323d4df9..4846b3db 100644 --- a/doc/source/_static/drydock.conf.sample +++ b/doc/source/_static/drydock.conf.sample @@ -258,6 +258,16 @@ # used to validate tokens that have restricted access rules. (string value) #service_type = +# Enable the SASL(Simple Authentication and Security Layer) if the SASL_enable +# is true, else disable. (boolean value) +#memcache_sasl_enabled = false + +# the user name for the SASL (string value) +#memcache_username = + +# the username password for SASL (string value) +#memcache_password = + # Authentication type to load (string value) # Deprecated group/name - [keystone_authtoken]/auth_plugin #auth_type = @@ -344,19 +354,40 @@ #http_client_retries = 3 +[networkconfig] + +# +# From drydock_provisioner +# + +# Maximum number of retries for network configuration (integer value) +#max_retries = 10 + +# Minimum delay between retries in seconds (integer value) +#retry_min_delay = 10 + +# Maximum delay between retries in seconds (integer value) +#retry_max_delay = 60 + + [oslo_policy] # # From oslo.policy # -# This option controls whether or not to enforce scope when evaluating policies. -# If ``True``, the scope of the token used in the request is compared to the -# ``scope_types`` of the policy being enforced. If the scopes do not match, an -# ``InvalidScope`` exception will be raised. If ``False``, a message will be -# logged informing operators that policies are being invoked with mismatching -# scope. (boolean value) -#enforce_scope = false +# DEPRECATED: This option controls whether or not to enforce scope when +# evaluating policies. If ``True``, the scope of the token used in the request +# is compared to the ``scope_types`` of the policy being enforced. 
If the scopes +# do not match, an ``InvalidScope`` exception will be raised. If ``False``, a +# message will be logged informing operators that policies are being invoked +# with mismatching scope. (boolean value) +# This option is deprecated for removal. +# Its value may be silently ignored in the future. +# Reason: This configuration was added temporarily to facilitate a smooth +# transition to the new RBAC. OpenStack will always enforce scope checks. This +# configuration option is deprecated and will be removed in the 2025.2 cycle. +#enforce_scope = true # This option controls whether or not to use old deprecated defaults when # evaluating policies. If ``True``, the old deprecated defaults are not going to @@ -367,12 +398,12 @@ # deprecated policy check string is logically OR'd with the new policy check # string, allowing for a graceful upgrade experience between releases with new # policies, which is the default behavior. (boolean value) -#enforce_new_defaults = false +#enforce_new_defaults = true # The relative or absolute path of a file that maps roles to permissions for a # given service. Relative paths must be specified in relation to the # configuration file setting this option. (string value) -#policy_file = policy.json +#policy_file = policy.yaml # Default rule. Enforced when a requested rule is not found. (string value) #policy_default_rule = default @@ -403,6 +434,10 @@ # Absolute path client key file REST based policy check (string value) #remote_ssl_client_key_file = +# Timeout in seconds for REST based policy check (floating point value) +# Minimum value: 0 +#remote_timeout = 60 + [plugins] diff --git a/etc/drydock/drydock.conf.sample b/etc/drydock/drydock.conf.sample index 323d4df9..4846b3db 100644 --- a/etc/drydock/drydock.conf.sample +++ b/etc/drydock/drydock.conf.sample @@ -258,6 +258,16 @@ # used to validate tokens that have restricted access rules. (string value) #service_type = +# Enable the SASL(Simple Authentication and Security Layer) if the SASL_enable +# is true, else disable. (boolean value) +#memcache_sasl_enabled = false + +# the user name for the SASL (string value) +#memcache_username = + +# the username password for SASL (string value) +#memcache_password = + # Authentication type to load (string value) # Deprecated group/name - [keystone_authtoken]/auth_plugin #auth_type = @@ -344,19 +354,40 @@ #http_client_retries = 3 +[networkconfig] + +# +# From drydock_provisioner +# + +# Maximum number of retries for network configuration (integer value) +#max_retries = 10 + +# Minimum delay between retries in seconds (integer value) +#retry_min_delay = 10 + +# Maximum delay between retries in seconds (integer value) +#retry_max_delay = 60 + + [oslo_policy] # # From oslo.policy # -# This option controls whether or not to enforce scope when evaluating policies. -# If ``True``, the scope of the token used in the request is compared to the -# ``scope_types`` of the policy being enforced. If the scopes do not match, an -# ``InvalidScope`` exception will be raised. If ``False``, a message will be -# logged informing operators that policies are being invoked with mismatching -# scope. (boolean value) -#enforce_scope = false +# DEPRECATED: This option controls whether or not to enforce scope when +# evaluating policies. If ``True``, the scope of the token used in the request +# is compared to the ``scope_types`` of the policy being enforced. If the scopes +# do not match, an ``InvalidScope`` exception will be raised. 
If ``False``, a +# message will be logged informing operators that policies are being invoked +# with mismatching scope. (boolean value) +# This option is deprecated for removal. +# Its value may be silently ignored in the future. +# Reason: This configuration was added temporarily to facilitate a smooth +# transition to the new RBAC. OpenStack will always enforce scope checks. This +# configuration option is deprecated and will be removed in the 2025.2 cycle. +#enforce_scope = true # This option controls whether or not to use old deprecated defaults when # evaluating policies. If ``True``, the old deprecated defaults are not going to @@ -367,12 +398,12 @@ # deprecated policy check string is logically OR'd with the new policy check # string, allowing for a graceful upgrade experience between releases with new # policies, which is the default behavior. (boolean value) -#enforce_new_defaults = false +#enforce_new_defaults = true # The relative or absolute path of a file that maps roles to permissions for a # given service. Relative paths must be specified in relation to the # configuration file setting this option. (string value) -#policy_file = policy.json +#policy_file = policy.yaml # Default rule. Enforced when a requested rule is not found. (string value) #policy_default_rule = default @@ -403,6 +434,10 @@ # Absolute path client key file REST based policy check (string value) #remote_ssl_client_key_file = +# Timeout in seconds for REST based policy check (floating point value) +# Minimum value: 0 +#remote_timeout = 60 + [plugins] diff --git a/python/drydock_provisioner/config.py b/python/drydock_provisioner/config.py index 0173a5c2..0148f3d6 100644 --- a/python/drydock_provisioner/config.py +++ b/python/drydock_provisioner/config.py @@ -123,6 +123,19 @@ class DrydockConfig(object): default='http://localhost:9000/api/v1.0/bootactions/') ] + # Options for the apply network configuration framework + networkconfig_options = [ + cfg.IntOpt('max_retries', + default=10, + help='Maximum number of retries for network configuration'), + cfg.IntOpt('retry_min_delay', + default=10, + help='Minimum delay between retries in seconds'), + cfg.IntOpt('retry_max_delay', + default=60, + help='Maximum delay between retries in seconds') + ] + # Options for network traffic network_options = [ cfg.IntOpt( @@ -228,6 +241,8 @@ class DrydockConfig(object): def register_options(self, enable_keystone=True): self.conf.register_opts(DrydockConfig.options) + self.conf.register_opts(DrydockConfig.networkconfig_options, + group='networkconfig') self.conf.register_opts(DrydockConfig.bootactions_options, group='bootactions') self.conf.register_opts(DrydockConfig.logging_options, group='logging') @@ -254,6 +269,7 @@ def list_opts(): 'timeouts': DrydockConfig.timeout_options, 'database': DrydockConfig.database_options, 'network': DrydockConfig.network_options, + 'networkconfig': DrydockConfig.networkconfig_options, } package_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/python/drydock_provisioner/drivers/node/maasdriver/actions/node.py b/python/drydock_provisioner/drivers/node/maasdriver/actions/node.py index 42970cfb..6c0b8885 100644 --- a/python/drydock_provisioner/drivers/node/maasdriver/actions/node.py +++ b/python/drydock_provisioner/drivers/node/maasdriver/actions/node.py @@ -14,6 +14,7 @@ """Task driver for completing node provisioning with Canonical MaaS 2.2+.""" import time +import secrets # Use secrets for cryptographic random numbers import logging import re import math @@ -45,7 +46,6 @@ import 
drydock_provisioner.drivers.node.maasdriver.models.volumegroup as maas_vg import drydock_provisioner.drivers.node.maasdriver.models.repository as maas_repo import drydock_provisioner.drivers.node.maasdriver.models.domain as maas_domain - class BaseMaasAction(BaseAction): def __init__(self, *args, maas_client=None): @@ -1375,6 +1375,9 @@ class ApplyNodeNetworking(BaseMaasAction): """Action to configure networking on a node.""" def start(self): + MAX_RETRIES = config.config_mgr.conf.networkconfig.max_retries + RETRY_MIN_DELAY = config.config_mgr.conf.networkconfig.retry_min_delay + RETRY_MAX_DELAY = config.config_mgr.conf.networkconfig.retry_max_delay try: maas_machine.Machines(self.maas_client).empty_refresh() @@ -1461,233 +1464,311 @@ class ApplyNodeNetworking(BaseMaasAction): ctx_type='node') if machine.status_name == 'Ready': - msg = "Located node %s in MaaS, starting interface configuration" % ( - n.name) - self.logger.debug(msg) - self.task.add_status_msg(msg=msg, - error=False, - ctx=n.name, - ctx_type='node') - - machine.reset_network_config() - machine.refresh() - - for i in n.interfaces: - if not i.network_link: + for attempt in range(MAX_RETRIES): + try: self.logger.debug( - "Interface %s has no network link, skipping configuration." - % (i.device_name)) - continue + "Attempting to configure networking for node %s (Attempt %d)" % ( + n.name, attempt + 1)) - nl = site_design.get_network_link(i.network_link) - - if nl.metalabels is not None: - if 'noconfig' in nl.metalabels: - self.logger.info( - "Interface %s connected to NetworkLink %s marked 'noconfig', skipping." - % (i.device_name, nl.name)) - continue - - fabric = fabrics.singleton({'name': nl.name}) - - if fabric is None: - msg = "No fabric found for NetworkLink %s" % ( - nl.name) - self.logger.error(msg) + msg = "Located node %s in MaaS, starting interface configuration" % ( + n.name) + self.logger.debug(msg) self.task.add_status_msg(msg=msg, - error=True, + error=False, ctx=n.name, ctx_type='node') - self.task.failure(focus=n.name) - continue - if nl.bonding_mode != hd_fields.NetworkLinkBondingMode.Disabled: - if len(i.get_hw_slaves()) >= 1: - msg = "Building node %s interface %s as a bond." % ( - n.name, i.device_name) - self.logger.debug(msg) - self.task.add_status_msg(msg=msg, - error=False, - ctx=n.name, - ctx_type='node') - hw_iface_list = i.get_hw_slaves() - hw_iface_logicalname_list = [] - for hw_iface in hw_iface_list: - hw_iface_logicalname_list.append( - n.get_logicalname(hw_iface)) - iface = machine.interfaces.create_bond( - device_name=i.device_name, - parent_names=hw_iface_logicalname_list, - mtu=nl.mtu, - fabric=fabric.resource_id, - mode=nl.bonding_mode, - monitor_interval=nl.bonding_mon_rate, - downdelay=nl.bonding_down_delay, - updelay=nl.bonding_up_delay, - lacp_rate=nl.bonding_peer_rate, - hash_policy=nl.bonding_xmit_hash) - else: - msg = "Network link %s indicates bonding, " \ - "interface %s has less than 2 slaves." % \ - (nl.name, i.device_name) - self.logger.warning(msg) - self.task.add_status_msg(msg=msg, - error=True, - ctx=n.name, - ctx_type='node') - self.task.failure(focus=n.name) - continue - else: - if len(i.get_hw_slaves()) > 1: - msg = "Network link %s disables bonding, interface %s has multiple slaves." % \ - (nl.name, i.device_name) - self.logger.warning(msg) - self.task.add_status_msg(msg=msg, - error=True, - ctx=n.name, - ctx_type='node') - self.task.failure(n.name) - continue - elif len(i.get_hw_slaves()) == 0: - msg = "Interface %s has 0 slaves." 
% ( - i.device_name) - self.logger.warning(msg) - self.task.add_status_msg(msg=msg, - error=True, - ctx=n.name, - ctx_type='node') - self.task.failure(focus=n.name) - else: - msg = "Configuring interface %s on node %s" % ( - i.device_name, n.name) - self.logger.debug(msg) - self.task.add_status_msg(msg=msg, - error=False, - ctx=n.name, - ctx_type='node') - hw_iface = i.get_hw_slaves()[0] - # TODO(sh8121att): HardwareProfile device alias integration - iface = machine.get_network_interface( - n.get_logicalname(hw_iface)) - - if iface is None: - msg = "Interface %s not found on node %s, skipping configuration" % ( - i.device_name, machine.resource_id) - self.logger.warning(msg) - self.task.add_status_msg(msg=msg, - error=True, - ctx=n.name, - ctx_type='node') - self.task.failure(focus=n.name) - continue - - if iface.fabric_id == fabric.resource_id: - self.logger.debug( - "Interface %s already attached to fabric_id %s" - % (i.device_name, fabric.resource_id)) - else: - self.logger.debug( - "Attaching node %s interface %s to fabric_id %s" - % (n.name, i.device_name, - fabric.resource_id)) - iface.attach_fabric( - fabric_id=fabric.resource_id) - - if iface.effective_mtu != nl.mtu: - self.logger.debug( - "Updating interface %s MTU to %s" % - (i.device_name, nl.mtu)) - iface.set_mtu(nl.mtu) - - for iface_net in getattr(i, 'networks', []): - dd_net = site_design.get_network(iface_net) - - if dd_net is not None: - link_iface = None - if iface_net == getattr( - nl, 'native_network', None): - # If a node interface is attached to the native network for a link - # then the interface itself should be linked to network, not a VLAN - # tagged interface + self.logger.debug("Reset network config for node %s." % (n.name)) + machine.reset_network_config() + self.logger.debug("Refreshing machine %s." % (n.name)) + machine.refresh() + self.logger.debug("Refreshing fabric and subnets list.") + fabrics.refresh() + subnets.refresh() + for i in n.interfaces: + if not i.network_link: self.logger.debug( - "Attaching node %s interface %s to untagged VLAN on fabric %s" - % (n.name, i.device_name, - fabric.resource_id)) - link_iface = iface - else: - # For non-native networks, we create VLAN tagged interfaces as children - # of this interface - vlan_options = { - 'vlan_tag': dd_net.vlan_id, - 'parent_name': iface.name, - } + "Interface %s has no network link, skipping configuration." + % (i.device_name)) + continue - if dd_net.mtu is not None: - vlan_options['mtu'] = dd_net.mtu + nl = site_design.get_network_link(i.network_link) - self.logger.debug( - "Creating tagged interface for VLAN %s on system %s interface %s" - % (dd_net.vlan_id, n.name, - i.device_name)) - - try: - link_iface = machine.interfaces.create_vlan( - **vlan_options) - except errors.DriverError as ex: - msg = "Error creating interface: %s" % str( - ex) - self.logger.info(msg) - self.task.add_status_msg( - msg=msg, - error=True, - ctx=n.name, - ctx_type='node') - self.task.failure(focus=n.name) + if nl.metalabels is not None: + if 'noconfig' in nl.metalabels: + self.logger.info( + "Interface %s connected to NetworkLink %s marked 'noconfig', skipping." 
+ % (i.device_name, nl.name)) continue - link_options = {} - link_options[ - 'primary'] = True if iface_net == getattr( - n, 'primary_network', - None) else False - link_options['subnet_cidr'] = dd_net.cidr + self.logger.debug("Refreshing fabric list.") + fabrics.refresh() + fabric = fabrics.singleton({'name': nl.name}) - found = False - for a in getattr(n, 'addressing', []): - if a.network == iface_net: + if fabric is None: + msg = "No fabric found for NetworkLink %s" % ( + nl.name) + self.logger.error(msg) + self.task.add_status_msg(msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + self.task.failure(focus=n.name) + continue + + if nl.bonding_mode != hd_fields.NetworkLinkBondingMode.Disabled: + if len(i.get_hw_slaves()) >= 1: + msg = "Building node %s interface %s as a bond." % ( + n.name, i.device_name) + self.logger.debug(msg) + self.logger.debug( + "Adding status message for node %s: %s" % (n.name, msg)) + self.task.add_status_msg(msg=msg, + error=False, + ctx=n.name, + ctx_type='node') + self.logger.debug( + "Fetching hardware slaves for interface %s on node %s." % ( + i.device_name, n.name)) + hw_iface_list = i.get_hw_slaves() + self.logger.debug("Hardware slaves fetched: %s" % hw_iface_list) + + hw_iface_logicalname_list = [] + for hw_iface in hw_iface_list: + logical_name = n.get_logicalname(hw_iface) + self.logger.debug( + "Mapping hardware slave %s to logical name %s." % ( + hw_iface, logical_name)) + hw_iface_logicalname_list.append(logical_name) + + msg = """Creating bond interface with parameters: + device_name=%s, + parent_names=%s, + mtu=%s, + fabric=%s, + mode=%s, + monitor_interval=%s, + downdelay=%s, + updelay=%s, + lacp_rate=%s, + hash_policy=%s""" % ( + i.device_name, + hw_iface_logicalname_list, + nl.mtu, + fabric.resource_id, + nl.bonding_mode, + nl.bonding_mon_rate, + nl.bonding_down_delay, + nl.bonding_up_delay, + nl.bonding_peer_rate, + nl.bonding_xmit_hash) + self.logger.debug(msg) + self.task.add_status_msg(msg=msg, + error=False, + ctx=n.name, + ctx_type='node') + + iface = machine.interfaces.create_bond( + device_name=i.device_name, + parent_names=hw_iface_logicalname_list, + mtu=nl.mtu, + fabric=fabric.resource_id, + mode=nl.bonding_mode, + monitor_interval=nl.bonding_mon_rate, + downdelay=nl.bonding_down_delay, + updelay=nl.bonding_up_delay, + lacp_rate=nl.bonding_peer_rate, + hash_policy=nl.bonding_xmit_hash) + else: + msg = "Network link %s indicates bonding, " \ + "interface %s has less than 2 slaves." % \ + (nl.name, i.device_name) + self.logger.warning(msg) + self.task.add_status_msg(msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + self.task.failure(focus=n.name) + continue + else: + if len(i.get_hw_slaves()) > 1: + msg = """Network link %s disables bonding, + interface %s has multiple slaves.""" % \ + (nl.name, i.device_name) + self.logger.warning(msg) + self.task.add_status_msg(msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + self.task.failure(n.name) + continue + elif len(i.get_hw_slaves()) == 0: + msg = "Interface %s has 0 slaves." 
% ( + i.device_name) + self.logger.warning(msg) + self.task.add_status_msg(msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + self.task.failure(focus=n.name) + else: + msg = "Configuring interface %s on node %s" % ( + i.device_name, n.name) + self.logger.debug(msg) + self.task.add_status_msg(msg=msg, + error=False, + ctx=n.name, + ctx_type='node') + hw_iface = i.get_hw_slaves()[0] + # TODO(sh8121att): HardwareProfile device alias integration + iface = machine.get_network_interface( + n.get_logicalname(hw_iface)) + + if iface is None: + msg = "Interface %s not found on node %s, skipping configuration" % ( + i.device_name, machine.resource_id) + self.logger.warning(msg) + self.task.add_status_msg(msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + self.task.failure(focus=n.name) + continue + + if iface.fabric_id == fabric.resource_id: + self.logger.debug( + "Interface %s already attached to fabric_id %s" + % (i.device_name, fabric.resource_id)) + else: + # Refresh the list of fabrics + self.logger.debug("Refreshing fabric list before attaching fabric.") + fabrics.refresh() + + # Validate the fabric_id + if not fabrics.contains(fabric.resource_id): + msg = "Fabric ID %s does not exist " + "in the refreshed list of fabrics." % fabric.resource_id + self.logger.error(msg) + raise errors.DriverError(msg) + + # Proceed with attaching the fabric + self.logger.debug( + "Attaching node %s interface %s to fabric_id %s" % ( + n.name, i.device_name, fabric.resource_id)) + iface.attach_fabric(fabric_id=fabric.resource_id) + + if iface.effective_mtu != nl.mtu: + self.logger.debug( + "Updating interface %s MTU to %s" % + (i.device_name, nl.mtu)) + iface.set_mtu(nl.mtu) + + for iface_net in getattr(i, 'networks', []): + dd_net = site_design.get_network(iface_net) + + if dd_net is not None: + link_iface = None + if iface_net == getattr( + nl, 'native_network', None): + # If a node interface is attached to the native network for a link + # then the interface itself should be linked to network, not a VLAN + # tagged interface + self.logger.debug( + "Attaching node %s interface %s to untagged VLAN on fabric %s" % ( + n.name, + i.device_name, + fabric.resource_id)) + link_iface = iface + else: + # For non-native networks, we create VLAN tagged interfaces as children + # of this interface + vlan_options = { + 'vlan_tag': dd_net.vlan_id, + 'parent_name': iface.name, + } + + if dd_net.mtu is not None: + vlan_options['mtu'] = dd_net.mtu + + self.logger.debug( + """Creating tagged interface for + VLAN %s on system %s interface %s""" % ( + dd_net.vlan_id, + n.name, + i.device_name)) + + try: + link_iface = machine.interfaces.create_vlan( + **vlan_options) + except errors.DriverError as ex: + msg = "Error creating interface: %s" % str( + ex) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + self.task.failure(focus=n.name) + continue + + link_options = {} link_options[ - 'ip_address'] = 'dhcp' if a.type == 'dhcp' else a.address - found = True + 'primary'] = True if iface_net == getattr( + n, 'primary_network', + None) else False + link_options['subnet_cidr'] = dd_net.cidr - if not found: - msg = "No addressed assigned to network %s for node %s, link is L2 only." 
% ( - iface_net, n.name) - self.logger.info(msg) - self.task.add_status_msg( - msg=msg, - error=False, - ctx=n.name, - ctx_type='node') - link_options['ip_address'] = None + found = False + for a in getattr(n, 'addressing', []): + if a.network == iface_net: + link_options[ + 'ip_address'] = 'dhcp' if a.type == 'dhcp' else a.address + found = True - msg = "Linking system %s interface %s to subnet %s" % ( - n.name, i.device_name, dd_net.cidr) - self.logger.info(msg) - self.task.add_status_msg(msg=msg, - error=False, - ctx=n.name, - ctx_type='node') + if not found: + msg = """No addressed assigned to network %s + for node %s, link is L2 only.""" % ( + iface_net, n.name) + self.logger.info(msg) + self.task.add_status_msg( + msg=msg, + error=False, + ctx=n.name, + ctx_type='node') + link_options['ip_address'] = None - link_iface.link_subnet(**link_options) - self.task.success(focus=n.name) - else: - self.task.failure(focus=n.name) - msg = "Did not find a defined Network %s to attach to interface" % iface_net - self.logger.error(msg) - self.task.add_status_msg(msg=msg, - error=True, - ctx=n.name, - ctx_type='node') + msg = "Linking system %s interface %s to subnet %s" % ( + n.name, i.device_name, dd_net.cidr) + self.logger.info(msg) + self.task.add_status_msg(msg=msg, + error=False, + ctx=n.name, + ctx_type='node') + + link_iface.link_subnet(**link_options) + self.task.success(focus=n.name) + else: + self.task.failure(focus=n.name) + msg = """Did not find a defined + Network %s to attach to interface""" % iface_net + self.logger.error(msg) + self.task.add_status_msg(msg=msg, + error=True, + ctx=n.name, + ctx_type='node') + break # Exit retry loop if successful + except Exception as ex: + self.logger.warning( + "Attempt %d failed for node %s: %s" % (attempt + 1, n.name, str(ex))) + if attempt < MAX_RETRIES - 1: + # Wait for a random time between 10 and 300 seconds + random_delay = RETRY_MIN_DELAY + \ + secrets.randbelow( + RETRY_MAX_DELAY - RETRY_MIN_DELAY + 1) + self.logger.debug( + f"Waiting for {random_delay} seconds before retrying...") + time.sleep(random_delay) elif machine.status_name == 'Broken': msg = ( "Located node %s in MaaS, status broken. Run " @@ -1727,7 +1808,7 @@ class ApplyNodeNetworking(BaseMaasAction): error=True, ctx=n.name, ctx_type='node') - self.task.failure() + self.task.failure(focus=n.name) except Exception as ex: msg = "Error configuring network for node %s: %s" % (n.name, str(ex)) diff --git a/python/drydock_provisioner/drivers/node/maasdriver/models/base.py b/python/drydock_provisioner/drivers/node/maasdriver/models/base.py index 1b1349e0..d5f8fa74 100644 --- a/python/drydock_provisioner/drivers/node/maasdriver/models/base.py +++ b/python/drydock_provisioner/drivers/node/maasdriver/models/base.py @@ -157,9 +157,17 @@ class ResourceBase(object): @classmethod def from_dict(cls, api_client, obj_dict): + """ + Create an instance of this resource class based on a dict + of MaaS type attributes. 
+ """ refined_dict = {k: obj_dict.get(k, None) for k in cls.fields} if 'id' in obj_dict.keys(): - refined_dict['resource_id'] = obj_dict.get('id') + refined_dict['resource_id'] = obj_dict.get('id') # Map 'id' to 'resource_id' + + # Ensure 'resource_id' is set correctly + if refined_dict.get('resource_id') is None: + raise ValueError("Missing 'resource_id' in object dictionary.") i = cls(api_client, **refined_dict) return i diff --git a/python/drydock_provisioner/drivers/node/maasdriver/models/fabric.py b/python/drydock_provisioner/drivers/node/maasdriver/models/fabric.py index cc3568ce..839b9181 100644 --- a/python/drydock_provisioner/drivers/node/maasdriver/models/fabric.py +++ b/python/drydock_provisioner/drivers/node/maasdriver/models/fabric.py @@ -25,9 +25,13 @@ class Fabric(model_base.ResourceBase): def __init__(self, api_client, **kwargs): super(Fabric, self).__init__(api_client, **kwargs) + self.logger.debug(f"Initializing Fabric with resource_id: {self.resource_id}") - if getattr(self, 'resource_id', None): + # Explicitly check if resource_id is not None + if getattr(self, 'resource_id', None) is not None: self.refresh_vlans() + else: + self.logger.warning("Cannot refresh VLANs for Fabric without resource_id.") def refresh(self): super(Fabric, self).refresh() diff --git a/python/drydock_provisioner/drivers/node/maasdriver/models/interface.py b/python/drydock_provisioner/drivers/node/maasdriver/models/interface.py index 56647665..497d0baf 100644 --- a/python/drydock_provisioner/drivers/node/maasdriver/models/interface.py +++ b/python/drydock_provisioner/drivers/node/maasdriver/models/interface.py @@ -60,14 +60,21 @@ class Interface(model_base.ResourceBase): :param fabric_id: The MaaS resource ID of a network Fabric to connect to :param fabric_name: The name of a MaaS fabric to connect to """ + self.logger.debug( + "Starting attach_fabric with parameters: fabric_id=%s, fabric_name=%s" % ( + fabric_id, fabric_name)) + fabric = None fabrics = maas_fabric.Fabrics(self.api_client) + self.logger.debug("Refreshing fabric list.") fabrics.refresh() if fabric_id is not None: + self.logger.debug("Looking for fabric with ID: %s" % fabric_id) fabric = fabrics.select(fabric_id) elif fabric_name is not None: + self.logger.debug("Looking for fabric with name: %s" % fabric_name) fabric = fabrics.singleton({'name': fabric_name}) else: self.logger.warning("Must specify fabric_id or fabric_name") @@ -81,21 +88,38 @@ class Interface(model_base.ResourceBase): "Fabric not found in MaaS for fabric_id %s, fabric_name %s" % (fabric_id, fabric_name)) + self.logger.debug("Found fabric: %s" % fabric.resource_id) + + self.logger.debug("Refreshing VLAN list for fabric %s." % fabric.resource_id) + if hasattr(fabric, 'vlans'): + fabric.vlans.refresh() + else: + self.logger.error( + "Fabric object has no attribute 'vlans'. Type: %s, value: %s" % ( + type(fabric), str(fabric))) + raise errors.DriverError("Fabric object has no attribute 'vlans'") + # Locate the untagged VLAN for this fabric. 
+ self.logger.debug("Looking for untagged VLAN (vid=0) on fabric %s" % fabric.resource_id) fabric_vlan = fabric.vlans.singleton({'vid': 0}) if fabric_vlan is None: self.logger.warning("Cannot locate untagged VLAN on fabric %s" % - (fabric_id)) + (fabric.resource_id)) raise errors.DriverError( - "Cannot locate untagged VLAN on fabric %s" % (fabric_id)) + "Cannot locate untagged VLAN on fabric %s" % (fabric.resource_id)) + + self.logger.debug("Found untagged VLAN: %s" % fabric_vlan.resource_id) self.vlan = fabric_vlan.resource_id self.logger.info( "Attaching interface %s on system %s to VLAN %s on fabric %s" % (self.resource_id, self.system_id, fabric_vlan.resource_id, fabric.resource_id)) + + self.logger.debug("Updating interface with new VLAN configuration.") self.update() + self.logger.debug("Interface %s successfully attached to fabric %s." % (self.resource_id, fabric.resource_id)) def is_linked(self, subnet_id): """Check if this interface is linked to the given subnet. diff --git a/python/drydock_provisioner/drivers/node/maasdriver/models/machine.py b/python/drydock_provisioner/drivers/node/maasdriver/models/machine.py index 42a76b8a..8da06b58 100644 --- a/python/drydock_provisioner/drivers/node/maasdriver/models/machine.py +++ b/python/drydock_provisioner/drivers/node/maasdriver/models/machine.py @@ -47,7 +47,7 @@ class Machine(model_base.ResourceBase): super(Machine, self).__init__(api_client, **kwargs) # Replace generic dicts with interface collection model - if getattr(self, 'resource_id', None): + if getattr(self, 'resource_id', None) is not None: self.interfaces = maas_interface.Interfaces( api_client, system_id=self.resource_id) self.interfaces.refresh()