Merge pull request #940 from ekholabs/fix/rvr__keepalived_restart

CLOUDSTACK-8952 - The redundant routers are facing a race condition due to several KeepaliveD/ConntrackD restartsThis PR fixes the following issues:

* KeepAliveD being restarted for each action performed on the routers
* ConntrackD configuration being copied for each action performed on the routers, causing several restarts
* ACS Management Server relying in the JSON file to report which router is Master/Backup
* Public Interface on both routers are in UP state due to several places checking if the interface is UP/DOWN and trying to do KeepAliveD
* Removing all the sleeps from the test_vpc_redundant.py - those are no longer needed
* When KeepAliveD calls master.py during the election, update the cmdline.json to set the router in Backup mode: the election will take care of changing it afterwards.
* Add LB stats_rules to iptables INPUT chain
* The RVR public interface is set to eth2 instead of eth1 - as in the rVPC. Make sure the check works in both cases

Those fixes make all the routers very stable, with ACL, FW, PF and LB working just fine!

* pr/940:
  CLOUDSTACK-8952 - Make the checkrouter.sh compatible with RVR as well
  CLOUDSTACK-8952 - Make the tests rely on the interface state other than the json file
  CLOUDSTACK-8952 - Reduce retried from 20 to 5
  CLOUDSTACK-8952 - Do not rely in the router state on the json file to report back to ACS
  CLOUDSTACK-8952 - Make the check for master more reliable
  CLOUDSTACK-8952 - Restart dnsmasq everytime the configure.py runs
  CLOUDSTACK-8952 - Make sure the calls to CsFile use the new logic of commit/is_changed methods
  CLOUDSTACK-8952 - Make sure we restart dnsmasq if the configuration file changes
  CLOUDSTACK-8952 - The public interface was comming UP in the Backup router
  CLOUDSTACK-8952 - Do not restart conntrackd unless it's needed
  CLOUDSTACK-8952 - Do not replace the conntrackd config file unless it's needed
  CLOUDSTACK-8952 - Remove the '--vrrp' search criteria form the CsProcess constructor call

Signed-off-by: Remi Bergsma <github@remi.nl>
This commit is contained in:
Remi Bergsma 2015-10-20 08:00:03 +02:00
commit 6fe5ae0d60
11 changed files with 147 additions and 75 deletions

View File

@ -16,9 +16,20 @@
# specific language governing permissions and limitations
# under the License.
STATUS=$(cat /etc/cloudstack/cmdline.json | grep redundant_state | awk '{print $2;}' | sed -e 's/[,\"]//g')
if [ "$?" -ne "0" ]
STATUS=UNKNOWN
INTERFACE=eth1
ROUTER_TYPE=$(cat /etc/cloudstack/cmdline.json | grep type | awk '{print $2;}' | sed -e 's/[,\"]//g')
if [ $ROUTER_TYPE = "router" ]
then
STATUS=MASTER
INTERFACE=eth2
fi
echo "Status: ${STATUS}"
ETH1_STATE=$(ip addr | grep $INTERFACE | grep state | awk '{print $9;}')
if [ $ETH1_STATE = "UP" ]
then
STATUS=MASTER
elif [ $ETH1_STATE = "DOWN" ]
then
STATUS=BACKUP
fi
echo "Status: ${STATUS}"

View File

@ -481,11 +481,11 @@ class CsSite2SiteVpn(CsDataBag):
file.addeq(" dpddelay=30")
file.addeq(" dpdtimeout=120")
file.addeq(" dpdaction=restart")
file.commit()
secret = CsFile(vpnsecretsfile)
secret.search("%s " % leftpeer, "%s %s: PSK \"%s\"" % (leftpeer, rightpeer, obj['ipsec_psk']))
secret.commit()
if secret.is_changed() or file.is_changed():
secret.commit()
file.commit()
logging.info("Configured vpn %s %s", leftpeer, rightpeer)
CsHelper.execute("ipsec auto --rereadall")
CsHelper.execute("ipsec --add vpn-%s" % rightpeer)

View File

@ -95,22 +95,6 @@ class CsAddress(CsDataBag):
return ip
return None
def check_if_link_exists(self,dev):
cmd="ip link show dev %s"%dev
result = CsHelper.execute(cmd)
if(len(result) != 0):
return True
else:
return False
def check_if_link_up(self,dev):
cmd="ip link show dev %s | tr '\n' ' ' | cut -d ' ' -f 9"%dev
result = CsHelper.execute(cmd)
if(result and result[0].lower() == "up"):
return True
else:
return False
def process(self):
for dev in self.dbag:
if dev == "id":
@ -118,11 +102,6 @@ class CsAddress(CsDataBag):
ip = CsIP(dev, self.config)
for address in self.dbag[dev]:
#check if link is up
if not self.check_if_link_up(dev):
cmd="ip link set %s up" % dev
CsHelper.execute(cmd)
ip.setAddress(address)
if ip.configured():
@ -328,7 +307,7 @@ class CsIP:
if " DOWN " in i:
cmd2 = "ip link set %s up" % self.getDevice()
# If redundant do not bring up public interfaces
# master.py and keepalived deal with tham
# master.py and keepalived will deal with them
if self.cl.is_redundant() and not self.is_public():
CsHelper.execute(cmd2)
# if not redundant bring everything up

View File

@ -50,8 +50,8 @@ class CsApache(CsApp):
file.search("Listen .*:80", "Listen %s:80" % (self.ip))
file.search("Listen .*:443", "Listen %s:443" % (self.ip))
file.search("ServerName.*", "\tServerName vhost%s.cloudinternal.com" % (self.dev))
file.commit()
if file.is_changed():
file.commit()
CsHelper.service("apache2", "restart")
self.fw.append(["", "front",

View File

@ -36,21 +36,23 @@ class CsDhcp(CsDataBag):
self.preseed()
self.cloud = CsFile(DHCP_HOSTS)
self.conf = CsFile(CLOUD_CONF)
length = len(self.conf)
for item in self.dbag:
if item == "id":
continue
self.add(self.dbag[item])
self.write_hosts()
if self.cloud.is_changed():
self.delete_leases()
self.configure_server()
# We restart DNSMASQ every time the configure.py is called in order to avoid lease problems.
CsHelper.service("dnsmasq", "restart")
self.conf.commit()
self.cloud.commit()
if self.conf.is_changed():
CsHelper.service("dnsmasq", "restart")
elif self.cloud.is_changed():
CsHelper.hup_dnsmasq("dnsmasq", "dnsmasq")
def configure_server(self):
# self.conf.addeq("dhcp-hostsfile=%s" % DHCP_HOSTS)
@ -131,8 +133,8 @@ class CsDhcp(CsDataBag):
file.repopulate()
for ip in self.hosts:
file.add("%s\t%s" % (ip, self.hosts[ip]))
file.commit()
if file.is_changed():
file.commit()
logging.info("Updated hosts file")
else:
logging.debug("Hosts file unchanged")

View File

@ -64,6 +64,9 @@ class CsFile:
handle.write(line)
handle.close()
logging.info("Wrote edited file %s" % self.filename)
self.config = list(self.new_config)
logging.info("Updated file in-cache configuration")
def dump(self):
for line in self.new_config:
@ -160,4 +163,6 @@ class CsFile:
def compare(self, o):
return (isinstance(o, self.__class__) and set(self.config) == set(o.new_config))
result = (isinstance(o, self.__class__) and set(self.config) == set(o.config))
logging.debug("Comparison of CsFiles content is ==> %s" % result)
return result

View File

@ -17,7 +17,6 @@
import logging
import os.path
import re
import shutil
from cs.CsDatabag import CsDataBag
from CsProcess import CsProcess
from CsFile import CsFile
@ -37,13 +36,14 @@ class CsLoadBalancer(CsDataBag):
return
config = self.dbag['config'][0]['configuration']
file1 = CsFile(HAPROXY_CONF_T)
file2 = CsFile(HAPROXY_CONF_P)
file1.empty()
for x in config:
[file1.append(w, -1) for w in x.split('\n')]
file1.commit()
file2 = CsFile(HAPROXY_CONF_P)
if not file2.compare(file1):
file1.commit()
shutil.copy2(HAPROXY_CONF_T, HAPROXY_CONF_P)
CsHelper.copy(HAPROXY_CONF_T, HAPROXY_CONF_P)
proc = CsProcess(['/var/run/haproxy.pid'])
if not proc.find():
@ -55,13 +55,15 @@ class CsLoadBalancer(CsDataBag):
add_rules = self.dbag['config'][0]['add_rules']
remove_rules = self.dbag['config'][0]['remove_rules']
self._configure_firewall(add_rules, remove_rules)
stat_rules = self.dbag['config'][0]['stat_rules']
self._configure_firewall(add_rules, remove_rules, stat_rules)
def _configure_firewall(self, add_rules, remove_rules):
def _configure_firewall(self, add_rules, remove_rules, stat_rules):
firewall = self.config.get_fw()
logging.debug("CsLoadBalancer:: configuring firewall. Add rules ==> %s" % add_rules)
logging.debug("CsLoadBalancer:: configuring firewall. Remove rules ==> %s" % remove_rules)
logging.debug("CsLoadBalancer:: configuring firewall. Stat rules ==> %s" % stat_rules)
for rules in add_rules:
path = rules.split(':')
@ -74,3 +76,9 @@ class CsLoadBalancer(CsDataBag):
ip = path[0]
port = path[1]
firewall.append(["filter", "", "-D INPUT -p tcp -m tcp -d %s --dport %s -m state --state NEW -j ACCEPT" % (ip, port)])
for rules in stat_rules:
path = rules.split(':')
ip = path[0]
port = path[1]
firewall.append(["filter", "", "-A INPUT -p tcp -m tcp -d %s --dport %s -m state --state NEW -j ACCEPT" % (ip, port)])

View File

@ -82,12 +82,10 @@ class CsRedundant(object):
def _redundant_on(self):
guest = self.address.get_guest_if()
# No redundancy if there is no guest network
if self.cl.is_master() or guest is None:
for obj in [o for o in self.address.get_ips() if o.is_public()]:
self.check_is_up(obj.get_device())
if guest is None:
self._redundant_off()
return
CsHelper.mkdir(self.CS_RAMDISK_DIR, 0755, False)
CsHelper.mount_tmpfs(self.CS_RAMDISK_DIR)
CsHelper.mkdir(self.CS_ROUTER_DIR, 0755, False)
@ -102,10 +100,6 @@ class CsRedundant(object):
"%s/%s" % (self.CS_TEMPLATES_DIR, "keepalived.conf.templ"), self.KEEPALIVED_CONF)
CsHelper.copy_if_needed(
"%s/%s" % (self.CS_TEMPLATES_DIR, "checkrouter.sh.templ"), "/opt/cloud/bin/checkrouter.sh")
#The file is always copied so the RVR doesn't't get the wrong config.
#Concerning the r-VPC, the configuration will be applied in a different manner
CsHelper.copy(
"%s/%s" % (self.CS_TEMPLATES_DIR, "conntrackd.conf.templ"), self.CONNTRACKD_CONF)
CsHelper.execute(
'sed -i "s/--exec\ \$DAEMON;/--exec\ \$DAEMON\ --\ --vrrp;/g" /etc/init.d/keepalived')
@ -127,12 +121,16 @@ class CsRedundant(object):
" auth_type AH \n", " auth_pass %s\n" % self.cl.get_router_password()])
keepalived_conf.section(
"virtual_ipaddress {", "}", self._collect_ips())
keepalived_conf.commit()
# conntrackd configuration
connt = CsFile(self.CONNTRACKD_CONF)
conntrackd_template_conf = "%s/%s" % (self.CS_TEMPLATES_DIR, "conntrackd.conf.templ")
conntrackd_temp_bkp = "%s/%s" % (self.CS_TEMPLATES_DIR, "conntrackd.conf.templ.bkp")
CsHelper.copy(conntrackd_template_conf, conntrackd_temp_bkp)
conntrackd_tmpl = CsFile(conntrackd_template_conf)
if guest is not None:
connt.section("Multicast {", "}", [
conntrackd_tmpl.section("Multicast {", "}", [
"IPv4_address 225.0.0.50\n",
"Group 3780\n",
"IPv4_interface %s\n" % guest.get_ip(),
@ -140,12 +138,21 @@ class CsRedundant(object):
"SndSocketBuffer 1249280\n",
"RcvSocketBuffer 1249280\n",
"Checksum on\n"])
connt.section("Address Ignore {", "}", self._collect_ignore_ips())
connt.commit()
conntrackd_tmpl.section("Address Ignore {", "}", self._collect_ignore_ips())
conntrackd_tmpl.commit()
if connt.is_changed():
conntrackd_conf = CsFile(self.CONNTRACKD_CONF)
is_equals = conntrackd_tmpl.compare(conntrackd_conf)
proc = CsProcess(['/etc/conntrackd/conntrackd.conf'])
if not proc.find() or not is_equals:
CsHelper.copy(conntrackd_template_conf, self.CONNTRACKD_CONF)
CsHelper.service("conntrackd", "restart")
# Restore the template file and remove the backup.
CsHelper.copy(conntrackd_temp_bkp, conntrackd_template_conf)
CsHelper.execute("rm -rf %s" % conntrackd_temp_bkp)
# Configure heartbeat cron job - runs every 30 seconds
heartbeat_cron = CsFile("/etc/cron.d/heartbeat")
heartbeat_cron.add("SHELL=/bin/bash", 0)
@ -173,8 +180,9 @@ class CsRedundant(object):
conntrackd_cron.add("@reboot root service conntrackd start", -1)
conntrackd_cron.commit()
proc = CsProcess(['/usr/sbin/keepalived', '--vrrp'])
proc = CsProcess(['/usr/sbin/keepalived'])
if not proc.find() or keepalived_conf.is_changed():
keepalived_conf.commit()
CsHelper.service("keepalived", "restart")
def release_lock(self):
@ -285,7 +293,6 @@ class CsRedundant(object):
route.add_defaultroute(gateway)
except:
logging.error("ERROR getting gateway from device %s" % dev)
else:
logging.error("Device %s was not ready could not bring it up" % dev)
@ -300,6 +307,7 @@ class CsRedundant(object):
ads = [o for o in self.address.get_ips() if o.needs_vrrp()]
for o in ads:
CsPasswdSvc(o.get_gateway()).restart()
CsHelper.service("dnsmasq", "restart")
self.cl.set_master_state(True)
self.cl.save()
@ -326,7 +334,7 @@ class CsRedundant(object):
In a DomR there will only ever be one address in a VPC there can be many
The new code also gives the possibility to cloudstack to have a hybrid device
thet could function as a router and VPC router at the same time
that could function as a router and VPC router at the same time
"""
lines = []
for o in self.address.get_ips():
@ -337,12 +345,12 @@ class CsRedundant(object):
else:
str = " %s brd %s dev %s\n" % (o.get_gateway_cidr(), o.get_broadcast(), o.get_device())
lines.append(str)
self.check_is_up(o.get_device())
return lines
def check_is_up(self, device):
""" Ensure device is up """
cmd = "ip link show %s | grep 'state DOWN'" % device
for i in CsHelper.execute(cmd):
if " DOWN " in i:
cmd2 = "ip link set %s up" % device

View File

@ -42,6 +42,9 @@ logging.basicConfig(filename=config.get_logger(),
format=config.get_format())
config.cmdline()
cl = CsCmdLine("cmdline", config)
#Update the configuration to set state as backup and let keepalived decide who is the real Master
cl.set_master_state(False)
cl.save()
config.set_address()
red = CsRedundant(config)

View File

@ -16,9 +16,20 @@
# specific language governing permissions and limitations
# under the License.
STATUS=$(cat /etc/cloudstack/cmdline.json | grep redundant_state | awk '{print $2;}' | sed -e 's/[,\"]//g')
if [ "$?" -ne "0" ]
STATUS=UNKNOWN
INTERFACE=eth1
ROUTER_TYPE=$(cat /etc/cloudstack/cmdline.json | grep type | awk '{print $2;}' | sed -e 's/[,\"]//g')
if [ $ROUTER_TYPE = "router" ]
then
STATUS=MASTER
INTERFACE=eth2
fi
echo "Status: ${STATUS}"
ETH1_STATE=$(ip addr | grep $INTERFACE | grep state | awk '{print $9;}')
if [ $ETH1_STATE = "UP" ]
then
STATUS=MASTER
elif [ $ETH1_STATE = "DOWN" ]
then
STATUS=BACKUP
fi
echo "Status: ${STATUS}"

View File

@ -37,8 +37,11 @@ from marvin.lib.base import (stopRouter,
from marvin.lib.common import (get_domain,
get_zone,
get_template,
list_routers)
from marvin.lib.utils import cleanup_resources
list_routers,
list_hosts)
from marvin.lib.utils import (cleanup_resources,
get_process_status,
get_host_credentials)
import socket
import time
import inspect
@ -236,7 +239,10 @@ class TestVPCRedundancy(cloudstackTestCase):
self.routers = []
self.networks = []
self.ips = []
self.apiclient = self.testClient.getApiClient()
self.hypervisor = self.testClient.getHypervisorInfo()
self.account = Account.create(
self.apiclient,
self.services["account"],
@ -288,13 +294,59 @@ class TestVPCRedundancy(cloudstackTestCase):
len(self.routers), count,
"Check that %s routers were indeed created" % count)
def check_master_status(self, count=2, showall=False):
def check_master_status(self,count=2, showall=False):
vals = ["MASTER", "BACKUP", "UNKNOWN"]
cnts = [0, 0, 0]
result = "UNKNOWN"
self.query_routers(count, showall)
for router in self.routers:
if router.state == "Running":
cnts[vals.index(router.redundantstate)] += 1
hosts = list_hosts(
self.apiclient,
zoneid=router.zoneid,
type='Routing',
state='Up',
id=router.hostid
)
self.assertEqual(
isinstance(hosts, list),
True,
"Check list host returns a valid list"
)
host = hosts[0]
if self.hypervisor.lower() in ('vmware', 'hyperv'):
result = str(get_process_status(
self.apiclient.connection.mgtSvr,
22,
self.apiclient.connection.user,
self.apiclient.connection.passwd,
router.linklocalip,
"sh /opt/cloud/bin/checkrouter.sh ",
hypervisor=self.hypervisor
))
else:
try:
host.user, host.passwd = get_host_credentials(
self.config, host.ipaddress)
result = str(get_process_status(
host.ipaddress,
22,
host.user,
host.passwd,
router.linklocalip,
"sh /opt/cloud/bin/checkrouter.sh "
))
except KeyError:
self.skipTest(
"Marvin configuration has no host credentials to\
check router services")
if result.count(vals[0]) == 1:
cnts[vals.index(vals[0])] += 1
if cnts[vals.index('MASTER')] != 1:
self.fail("No Master or too many master routers found %s" % cnts[vals.index('MASTER')])
@ -458,26 +510,20 @@ class TestVPCRedundancy(cloudstackTestCase):
self.query_routers()
self.networks.append(self.create_network(self.services["network_offering"], "10.1.1.1"))
self.networks.append(self.create_network(self.services["network_offering_no_lb"], "10.1.2.1"))
time.sleep(30)
self.check_master_status(2)
self.add_nat_rules()
self.do_vpc_test(False)
time.sleep(30)
self.stop_router_by_type("MASTER")
# wait for the backup router to transit to master state
time.sleep(30)
self.check_master_status(1)
self.do_vpc_test(False)
self.delete_nat_rules()
time.sleep(45)
self.check_master_status(1)
self.do_vpc_test(True)
self.start_routers()
self.add_nat_rules()
time.sleep(30)
self.check_master_status(2)
self.do_vpc_test(False)
@ -488,7 +534,6 @@ class TestVPCRedundancy(cloudstackTestCase):
self.query_routers()
self.networks.append(self.create_network(self.services["network_offering"], "10.1.1.1"))
self.networks.append(self.create_network(self.services["network_offering_no_lb"], "10.1.2.1"))
time.sleep(30)
self.check_master_status(2)
self.add_nat_rules()
self.do_default_routes_test()
@ -510,7 +555,7 @@ class TestVPCRedundancy(cloudstackTestCase):
time.sleep(5)
def do_vpc_test(self, expectFail):
retries = 20
retries = 5
if expectFail:
retries = 2
for o in self.networks: