diff --git a/jobs/pxc-mysql/spec b/jobs/pxc-mysql/spec
index 7c555355b..eec4b2ecd 100644
--- a/jobs/pxc-mysql/spec
+++ b/jobs/pxc-mysql/spec
@@ -12,7 +12,6 @@ templates:
   db_init.erb: config/db_init
   disable_mysql_cli_history.sh.erb: config/disable_mysql_cli_history.sh
-  pre-stop.erb: bin/pre-stop
   galera-ca.pem.erb: certificates/galera-ca.pem
   galera-cert.pem.erb: certificates/galera-cert.pem
   galera-init-config.yml.erb: config/galera-init-config.yml
@@ -20,12 +19,16 @@ templates:
   get-sequence-number.sh.erb: bin/get-sequence-number
   my.cnf.erb: config/my.cnf
   mylogin.cnf.erb: config/mylogin.cnf
-  pre-start.sh.erb: bin/pre-start
   pxc-sudoers: config/pxc-sudoers
   server-ca.pem.erb: certificates/server-ca.pem
   server-cert.pem.erb: certificates/server-cert.pem
   server-key.pem.erb: certificates/server-key.pem
+  # bosh lifecycle scripts
+  pre-stop.erb: bin/pre-stop
+  pre-start.sh.erb: bin/pre-start
+  post-start.sh.erb: bin/post-start
+
 
 packages:
   - auto-tune-mysql
   - galera-init
diff --git a/jobs/pxc-mysql/templates/post-start.sh.erb b/jobs/pxc-mysql/templates/post-start.sh.erb
new file mode 100644
index 000000000..7b64534fe
--- /dev/null
+++ b/jobs/pxc-mysql/templates/post-start.sh.erb
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+
+set -o errexit -o nounset -o pipefail
+# <%- if p('pxc_enabled') -%>
+# start: pxc_enabled
+
+source /var/vcap/packages/pxc-utils/logging.sh
+
+check_bpm_pid() {
+  /var/vcap/jobs/bpm/bin/bpm pid pxc-mysql -p galera-init >/dev/null 2>&1
+}
+
+log "post-start: waiting for galera-init to become healthy on port 8114"
+elapsed=0
+while ! curl -s -f -m 5 http://127.0.0.1:8114 > /dev/null && check_bpm_pid; do
+  sleep 1
+  elapsed=$((elapsed + 1))
+  if [[ $((elapsed % 30)) == 0 ]]; then
+    log "post-start: still waiting for galera-init after ${elapsed} seconds"
+  fi
+done
+
+if ! check_bpm_pid; then
+  log "post-start: galera-init process for pxc-mysql job failed to start"
+  log "post-start: this is expected if cluster has lost quorum"
+  log "post-start: run the bootstrap errand to restore cluster quorum"
+  exit 1
+fi
+
+log "post-start: galera-init started successfully"
+
+# end: pxc_enabled
+# <%- end -%>
+
diff --git a/jobs/pxc-mysql/templates/pre-start.sh.erb b/jobs/pxc-mysql/templates/pre-start.sh.erb
index e6faf8858..efc211963 100644
--- a/jobs/pxc-mysql/templates/pre-start.sh.erb
+++ b/jobs/pxc-mysql/templates/pre-start.sh.erb
@@ -155,9 +155,6 @@ chown -R vcap:vcap ${datadir}
 
 rm -f /etc/my.cnf
 
-check_bpm_pid() {
-  /var/vcap/jobs/bpm/bin/bpm pid pxc-mysql -p galera-init >/dev/null 2>&1
-}
 
 is_pxc57_datadir() {
   [[ -f /var/vcap/store/pxc-mysql/mysql/user.frm ]]
@@ -188,19 +185,4 @@ if is_pxc80_datadir; then
   apply_pxc80_crash_recovery
 fi
 
-if ! /var/vcap/jobs/bpm/bin/bpm start pxc-mysql -p galera-init; then
-  log "pre-start: galera-init failed to initialize"
-  exit 1
-fi
-
-while ! curl -s -f -m 5 http://127.0.0.1:8114 > /dev/null && check_bpm_pid; do
-  sleep 1
-done
-
-if ! check_bpm_pid; then
-  log "pre-start: galera-init failed to start"
-  exit 1
-fi
-
-log "pre-start: galera-init started successfully"
 <% end %>
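The readiness wait that used to live in pre-start now runs in post-start. For a quick spot check on a mysql VM, the same probe can be run by hand; a minimal sketch using only the bpm pid check and the port-8114 healthcheck endpoint that appear in the templates above:

    # Manual spot check of galera-init health, mirroring the post-start wait loop above.
    if /var/vcap/jobs/bpm/bin/bpm pid pxc-mysql -p galera-init >/dev/null 2>&1 \
        && curl -s -f -m 5 http://127.0.0.1:8114 >/dev/null; then
      echo "galera-init is running and healthy"
    else
      echo "galera-init is not running or not yet healthy"
    fi
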
diff --git a/src/e2e-tests/bootstrap_test.go b/src/e2e-tests/bootstrap_test.go
index de7d86b4a..380e84e8c 100644
--- a/src/e2e-tests/bootstrap_test.go
+++ b/src/e2e-tests/bootstrap_test.go
@@ -32,8 +32,15 @@ var _ = Describe("Bootstrapping an offline cluster", Ordered, Label("bootstrap")
 			bosh.Operation("use-clustered.yml"),
 			bosh.Operation("galera-agent-tls.yml"),
 			bosh.Operation("test/seed-test-user.yml"),
-			bosh.Operation(`iaas/cluster.yml`),
+			bosh.Operation("iaas/cluster.yml"),
 		)).To(Succeed())
+		DeferCleanup(func() {
+			if CurrentSpecReport().Failed() {
+				return
+			}
+
+			Expect(bosh.DeleteDeployment(deploymentName)).To(Succeed())
+		})
 
 		Expect(bosh.RunErrand(deploymentName, "smoke-tests", "mysql/first")).
 			To(Succeed())
@@ -46,22 +53,11 @@ var _ = Describe("Bootstrapping an offline cluster", Ordered, Label("bootstrap")
 		Expect(err).NotTo(HaveOccurred())
 		db.SetMaxIdleConns(0)
 		db.SetMaxOpenConns(1)
-	})
 
-	BeforeAll(func() {
-		var err error
 		galeraAgentPassword, err = credhub.GetCredhubPassword("/" + deploymentName + "/cf_mysql_mysql_galera_healthcheck_endpoint_password")
 		Expect(err).NotTo(HaveOccurred())
 	})
 
-	AfterAll(func() {
-		if CurrentSpecReport().Failed() {
-			return
-		}
-
-		Expect(bosh.DeleteDeployment(deploymentName)).To(Succeed())
-	})
-
 	stopMySQL := func(c *http.Client, host string) {
 		req, err := http.NewRequest(http.MethodPost, "https://"+host+":9201/stop_mysql", nil)
 		Expect(err).NotTo(HaveOccurred())
@@ -186,4 +182,62 @@ var _ = Describe("Bootstrapping an offline cluster", Ordered, Label("bootstrap")
 			Expect(data).To(Equal(deploymentName + ": data written with 3 nodes"))
 		})
 	})
+
+	When("the entire cluster goes offline and a node is recreated", Label("recreate"), Ordered, func() {
+		BeforeAll(func() {
+			By("shutting down mysql on all nodes")
+			mysqlIps, err := bosh.InstanceIPs(deploymentName, bosh.MatchByInstanceGroup("mysql"))
+			Expect(err).NotTo(HaveOccurred())
+
+			for _, ip := range mysqlIps {
+				GinkgoWriter.Println("Stopping MySQL on instance=" + ip)
+				stopMySQL(httpClient, ip)
+			}
+
+			By("waiting for BOSH to detect the failing instances")
+			Eventually(func() (states []string) {
+				instances, err := bosh.Instances(deploymentName, bosh.MatchByInstanceGroup("mysql"))
+				if err != nil {
+					return nil
+				}
+				for _, instance := range instances {
+					states = append(states, instance.ProcessState)
+				}
+
+				return states
+			}, "30s", "2s").Should(ConsistOf("failing", "failing", "failing"),
+				"Expected all mysql instances to be in failing state after stopping MySQL")
+
+			Expect(bosh.Recreate(deploymentName, "mysql/0")).ToNot(Succeed(),
+				`Expected recreating mysql/0 when cluster is offline to fail`)
+		})
+
+		It("can still recover an offline cluster by running the bootstrap errand", func() {
+			// Run the bootstrap errand and expect it to succeed; The cluster quorum should now be restored.
+			Expect(bosh.RunErrand(deploymentName, "bootstrap", "mysql/0")).To(Succeed())
+
+			// Observe the cluster size return to a healthy number
+			var unused, clusterSize string
+			Expect(db.QueryRow(`SHOW GLOBAL STATUS LIKE 'wsrep\_cluster\_size'`).
+				Scan(&unused, &clusterSize)).To(Succeed())
+			Expect(clusterSize).To(Equal("3"))
+
+			// Validate data is still present on all three nodes
+			mysqlIps, err := bosh.InstanceIPs(deploymentName, bosh.MatchByInstanceGroup("mysql"))
+			Expect(err).NotTo(HaveOccurred())
+			Expect(mysqlIps).To(HaveLen(3))
+			for _, host := range mysqlIps {
+				db, err := sql.Open("mysql", "test-admin:integration-tests@tcp("+host+")/pxc_release_test_db?tls=skip-verify")
+				Expect(err).NotTo(HaveOccurred())
+				var data string
+				Expect(db.QueryRow(`SELECT test_data FROM pxc_release_test_db.bootstrap_test`).
+					Scan(&data)).To(Succeed())
+				Expect(data).To(Equal(deploymentName + ": data written with 3 nodes"))
+			}
+
+			// Validate data can still be written through the proxy
+			Expect(bosh.RunErrand(deploymentName, "smoke-tests", "mysql/first")).
+				To(Succeed())
+		})
+	})
 })
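The new test case automates a recovery flow that can also be walked through with the bosh CLI directly; a rough sketch, where DEPLOYMENT is a placeholder for the test deployment name:

    # Rough manual equivalent of the new "recreate" test case; DEPLOYMENT is a placeholder.
    bosh -d "$DEPLOYMENT" instances                           # mysql instances report "failing" once MySQL is stopped
    bosh -d "$DEPLOYMENT" -n recreate --no-converge mysql/0   # expected to fail while the cluster has no quorum
    bosh -d "$DEPLOYMENT" run-errand bootstrap --instance=mysql/0
    bosh -d "$DEPLOYMENT" run-errand smoke-tests --instance=mysql/first
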
diff --git a/src/e2e-tests/utilities/bosh/bosh.go b/src/e2e-tests/utilities/bosh/bosh.go
index b07f1056d..c4c2891b4 100644
--- a/src/e2e-tests/utilities/bosh/bosh.go
+++ b/src/e2e-tests/utilities/bosh/bosh.go
@@ -21,10 +21,11 @@ type DeployOptionFunc func(args *[]string)
 type MatchInstanceFunc func(instance Instance) bool
 
 type Instance struct {
-	IP       string `json:"ips"`
-	Instance string `json:"instance"`
-	Index    string `json:"index"`
-	VMCid    string `json:"vm_cid"`
+	IP           string `json:"ips"`
+	Instance     string `json:"instance"`
+	Index        string `json:"index"`
+	VMCid        string `json:"vm_cid"`
+	ProcessState string `json:"process_state"`
 }
 
 func CloudCheck(deploymentName string) error {
@@ -273,6 +274,18 @@ func Restart(deploymentName, instanceSpec string) error {
 	)
 }
 
+func Recreate(deploymentName, instanceSpec string) error {
+	return cmd.Run(
+		"bosh",
+		"--deployment="+deploymentName,
+		"--non-interactive",
+		"--tty",
+		"recreate",
+		"--no-converge",
+		instanceSpec,
+	)
+}
+
 func RemoteCommand(deploymentName, instanceSpec, cmdString string) (string, error) {
 	var output bytes.Buffer
 	if err := cmd.RunWithoutOutput(io.MultiWriter(&output, GinkgoWriter),
diff --git a/src/github.com/cloudfoundry-incubator/galera-healthcheck/node_manager/node_manager.go b/src/github.com/cloudfoundry-incubator/galera-healthcheck/node_manager/node_manager.go
index 4da29316e..e82313ce5 100644
--- a/src/github.com/cloudfoundry-incubator/galera-healthcheck/node_manager/node_manager.go
+++ b/src/github.com/cloudfoundry-incubator/galera-healthcheck/node_manager/node_manager.go
@@ -93,10 +93,6 @@ func (m *NodeManager) StopService(_ *http.Request) (string, error) {
 	m.Mutex.Lock()
 	defer m.Mutex.Unlock()
 
-	if err := os.WriteFile(m.StateFilePath, []byte("SINGLE_NODE"), 0777); err != nil {
-		return "", fmt.Errorf("failed to initialize state file: %w", err)
-	}
-
 	if err := m.MonitClient.Stop(m.ServiceName); err != nil {
 		return "", err
 	}
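With the SINGLE_NODE write removed, hitting the galera-agent's /stop_mysql endpoint should leave the galera-init state file untouched. A sketch of how to confirm that by hand on a mysql VM; the state-file path and the endpoint credentials are placeholders rather than values taken from this patch, while port 9201 and the POST /stop_mysql route come from the e2e test above:

    # STATE_FILE, AGENT_USER and AGENT_PASSWORD are placeholders.
    cat "$STATE_FILE"                      # note the current value
    curl -k -X POST -u "$AGENT_USER:$AGENT_PASSWORD" "https://127.0.0.1:9201/stop_mysql"
    cat "$STATE_FILE"                      # unchanged; previously this was overwritten with SINGLE_NODE
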
"github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" ) var _ = Describe("NodeManager", func() { @@ -71,7 +72,7 @@ var _ = Describe("NodeManager", func() { }) When("monit starts successfully", func() { - When("the service fails during initailization", func() { + When("the service fails during initialization", func() { BeforeEach(func() { fakeMonit.StartReturns(nil) fakeMonit.StatusReturns("failing", nil) @@ -183,8 +184,8 @@ var _ = Describe("NodeManager", func() { }) }) - When("joining an existing cluter", func() { - When("the service fails during initailization", func() { + When("joining an existing cluster", func() { + When("the service fails during initialization", func() { BeforeEach(func() { fakeMonit.StartReturns(nil) fakeMonit.StatusReturns("failing", nil) @@ -297,7 +298,7 @@ var _ = Describe("NodeManager", func() { }) When("monit starts successfully", func() { - When("the service fails during initailization", func() { + When("the service fails during initialization", func() { BeforeEach(func() { fakeMonit.StartReturns(nil) fakeMonit.StatusReturns("failing", nil) @@ -399,6 +400,18 @@ var _ = Describe("NodeManager", func() { Expect(err).NotTo(HaveOccurred()) Expect(msg).To(Equal(`stop successful`)) }) + + It("does not modify the state file", func() { + Expect(os.WriteFile(mgr.StateFilePath, []byte("PRE_EXISTING_CLUSTER_STATE"), 0o0644)).To(Succeed()) + + _, err := mgr.StopService(nil) + Expect(err).NotTo(HaveOccurred()) + + contents, err := os.ReadFile(mgr.StateFilePath) + Expect(err).NotTo(HaveOccurred()) + + Expect(string(contents)).To(Equal("PRE_EXISTING_CLUSTER_STATE")) + }) }) })