7 changes: 5 additions & 2 deletions jobs/pxc-mysql/spec
@@ -12,20 +12,23 @@ templates:

db_init.erb: config/db_init
disable_mysql_cli_history.sh.erb: config/disable_mysql_cli_history.sh
pre-stop.erb: bin/pre-stop
galera-ca.pem.erb: certificates/galera-ca.pem
galera-cert.pem.erb: certificates/galera-cert.pem
galera-init-config.yml.erb: config/galera-init-config.yml
galera-key.pem.erb: certificates/galera-key.pem
get-sequence-number.sh.erb: bin/get-sequence-number
my.cnf.erb: config/my.cnf
mylogin.cnf.erb: config/mylogin.cnf
pre-start.sh.erb: bin/pre-start
pxc-sudoers: config/pxc-sudoers
server-ca.pem.erb: certificates/server-ca.pem
server-cert.pem.erb: certificates/server-cert.pem
server-key.pem.erb: certificates/server-key.pem

# bosh lifecycle scripts
pre-stop.erb: bin/pre-stop
pre-start.sh.erb: bin/pre-start
post-start.sh.erb: bin/post-start

packages:
- auto-tune-mysql
- galera-init
34 changes: 34 additions & 0 deletions jobs/pxc-mysql/templates/post-start.sh.erb
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

set -o errexit -o nounset -o pipefail
# <%- if p('pxc_enabled') -%>
# start: pxc_enabled

source /var/vcap/packages/pxc-utils/logging.sh

check_bpm_pid() {
/var/vcap/jobs/bpm/bin/bpm pid pxc-mysql -p galera-init >/dev/null 2>&1
}

log "post-start: waiting for galera-init to become healthy on port 8114"
elapsed=0
while ! curl -s -f -m 5 http://127.0.0.1:8114 > /dev/null && check_bpm_pid; do
sleep 1
elapsed=$((elapsed + 1))
if [[ $((elapsed % 30)) == 0 ]]; then
log "post-start: still waiting for galera-init after ${elapsed} seconds"
fi
done

if ! check_bpm_pid; then
log "post-start: galera-init process for pxc-mysql job failed to start"
log "post-start: this is expected if cluster has lost quorum"
log "post-start: run the bootstrap errand to restore cluster quorum"
exit 1
fi

log "post-start: galera-init started successfully"

# end: pxc_enabled
# <%- end -%>
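
The wait loop above exits either when the galera-init health endpoint on port 8114 starts answering or when the bpm-managed process disappears, and the trailing pid check tells the two cases apart. A rough manual spot check of the same signals on a node might look like the sketch below (assuming the bpm job name, process name, and health port used by the script above):

# Sketch only: manual spot check of galera-init health on a pxc-mysql node.
if /var/vcap/jobs/bpm/bin/bpm pid pxc-mysql -p galera-init >/dev/null 2>&1; then
  if curl -s -f -m 5 http://127.0.0.1:8114 >/dev/null; then
    echo "galera-init is running and healthy"
  else
    echo "galera-init is running but not yet reporting healthy"
  fi
else
  echo "galera-init is not running; the bootstrap errand may be required"
fi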

18 changes: 0 additions & 18 deletions jobs/pxc-mysql/templates/pre-start.sh.erb
@@ -155,9 +155,6 @@ chown -R vcap:vcap ${datadir}

rm -f /etc/my.cnf

check_bpm_pid() {
/var/vcap/jobs/bpm/bin/bpm pid pxc-mysql -p galera-init >/dev/null 2>&1
}

is_pxc57_datadir() {
[[ -f /var/vcap/store/pxc-mysql/mysql/user.frm ]]
@@ -188,19 +185,4 @@ if is_pxc80_datadir; then
apply_pxc80_crash_recovery
fi

if ! /var/vcap/jobs/bpm/bin/bpm start pxc-mysql -p galera-init; then
log "pre-start: galera-init failed to initialize"
exit 1
fi

while ! curl -s -f -m 5 http://127.0.0.1:8114 > /dev/null && check_bpm_pid; do
sleep 1
done

if ! check_bpm_pid; then
log "pre-start: galera-init failed to start"
exit 1
fi

log "pre-start: galera-init started successfully"
<% end %>
78 changes: 66 additions & 12 deletions src/e2e-tests/bootstrap_test.go
@@ -32,8 +32,15 @@ var _ = Describe("Bootstrapping an offline cluster", Ordered, Label("bootstrap")
bosh.Operation("use-clustered.yml"),
bosh.Operation("galera-agent-tls.yml"),
bosh.Operation("test/seed-test-user.yml"),
bosh.Operation(`iaas/cluster.yml`),
bosh.Operation("iaas/cluster.yml"),
)).To(Succeed())
DeferCleanup(func() {
if CurrentSpecReport().Failed() {
return
}

Expect(bosh.DeleteDeployment(deploymentName)).To(Succeed())
})

Expect(bosh.RunErrand(deploymentName, "smoke-tests", "mysql/first")).
To(Succeed())
@@ -46,22 +53,11 @@ var _ = Describe("Bootstrapping an offline cluster", Ordered, Label("bootstrap")
Expect(err).NotTo(HaveOccurred())
db.SetMaxIdleConns(0)
db.SetMaxOpenConns(1)
})

BeforeAll(func() {
var err error
galeraAgentPassword, err = credhub.GetCredhubPassword("/" + deploymentName + "/cf_mysql_mysql_galera_healthcheck_endpoint_password")
Expect(err).NotTo(HaveOccurred())
})

AfterAll(func() {
if CurrentSpecReport().Failed() {
return
}

Expect(bosh.DeleteDeployment(deploymentName)).To(Succeed())
})

stopMySQL := func(c *http.Client, host string) {
req, err := http.NewRequest(http.MethodPost, "https://"+host+":9201/stop_mysql", nil)
Expect(err).NotTo(HaveOccurred())
@@ -186,4 +182,62 @@ var _ = Describe("Bootstrapping an offline cluster", Ordered, Label("bootstrap")
Expect(data).To(Equal(deploymentName + ": data written with 3 nodes"))
})
})

When("the entire cluster goes offline and a node is recreated", Label("recreate"), Ordered, func() {
BeforeAll(func() {
By("shutting down mysql on all nodes")
mysqlIps, err := bosh.InstanceIPs(deploymentName, bosh.MatchByInstanceGroup("mysql"))
Expect(err).NotTo(HaveOccurred())

for _, ip := range mysqlIps {
GinkgoWriter.Println("Stopping MySQL on instance=" + ip)
stopMySQL(httpClient, ip)
}

By("waiting for BOSH to detect the failing instances")
Eventually(func() (states []string) {
instances, err := bosh.Instances(deploymentName, bosh.MatchByInstanceGroup("mysql"))
if err != nil {
return nil
}
for _, instance := range instances {
states = append(states, instance.ProcessState)
}

return states
}, "30s", "2s").Should(ConsistOf("failing", "failing", "failing"),
"Expected all mysql instances to be in failing state after stopping MySQL")

Expect(bosh.Recreate(deploymentName, "mysql/0")).ToNot(Succeed(),
`Expected recreating mysql/0 when cluster is offline to fail`)
})

It("can still recover an offline cluster by running the bootstrap errand", func() {
// Run the bootstrap errand and expect it to succeed; the cluster quorum should now be restored.
Expect(bosh.RunErrand(deploymentName, "bootstrap", "mysql/0")).To(Succeed())

// Observe the cluster size return to a healthy number
var unused, clusterSize string
Expect(db.QueryRow(`SHOW GLOBAL STATUS LIKE 'wsrep\_cluster\_size'`).
Scan(&unused, &clusterSize)).To(Succeed())
Expect(clusterSize).To(Equal("3"))

// Validate data is still present on all three nodes
mysqlIps, err := bosh.InstanceIPs(deploymentName, bosh.MatchByInstanceGroup("mysql"))
Expect(err).NotTo(HaveOccurred())
Expect(mysqlIps).To(HaveLen(3))
for _, host := range mysqlIps {
db, err := sql.Open("mysql", "test-admin:integration-tests@tcp("+host+")/pxc_release_test_db?tls=skip-verify")
Expect(err).NotTo(HaveOccurred())
var data string
Expect(db.QueryRow(`SELECT test_data FROM pxc_release_test_db.bootstrap_test`).
Scan(&data)).To(Succeed())
Expect(data).To(Equal(deploymentName + ": data written with 3 nodes"))
}

// Validate data can still be written through the proxy
Expect(bosh.RunErrand(deploymentName, "smoke-tests", "mysql/first")).
To(Succeed())
})
})
})
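
The recovery flow exercised above maps onto plain BOSH CLI calls; a rough manual equivalent is sketched below (the deployment name "pxc" is a placeholder, and the errand and instance names are the ones used by the test):

# Sketch: manually restoring quorum after a full cluster outage.
bosh -d pxc -n run-errand bootstrap --instance=mysql/0
# Re-run smoke tests once the cluster is back.
bosh -d pxc -n run-errand smoke-tests --instance=mysql/first
# From any node, the cluster size should report 3 again:
#   SHOW GLOBAL STATUS LIKE 'wsrep_cluster_size';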
21 changes: 17 additions & 4 deletions src/e2e-tests/utilities/bosh/bosh.go
@@ -21,10 +21,11 @@ type DeployOptionFunc func(args *[]string)
type MatchInstanceFunc func(instance Instance) bool

type Instance struct {
IP string `json:"ips"`
Instance string `json:"instance"`
Index string `json:"index"`
VMCid string `json:"vm_cid"`
IP string `json:"ips"`
Instance string `json:"instance"`
Index string `json:"index"`
VMCid string `json:"vm_cid"`
ProcessState string `json:"process_state"`
}

func CloudCheck(deploymentName string) error {
@@ -273,6 +274,18 @@ func Restart(deploymentName, instanceSpec string) error {
)
}

func Recreate(deploymentName, instanceSpec string) error {
return cmd.Run(
"bosh",
"--deployment="+deploymentName,
"--non-interactive",
"--tty",
"recreate",
"--no-converge",
instanceSpec,
)
}

func RemoteCommand(deploymentName, instanceSpec, cmdString string) (string, error) {
var output bytes.Buffer
if err := cmd.RunWithoutOutput(io.MultiWriter(&output, GinkgoWriter),
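
The new Recreate helper wraps the BOSH CLI recreate command. The --no-converge flag recreates only the named instance from its last successfully applied configuration instead of running a full deploy; in the test above, the recreate of mysql/0 is expected to fail while the cluster is offline, matching the post-start behaviour added earlier in this diff. A rough CLI equivalent (deployment name is a placeholder):

bosh --deployment=pxc --non-interactive --tty recreate --no-converge mysql/0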
@@ -93,10 +93,6 @@ func (m *NodeManager) StopService(_ *http.Request) (string, error) {
m.Mutex.Lock()
defer m.Mutex.Unlock()

if err := os.WriteFile(m.StateFilePath, []byte("SINGLE_NODE"), 0777); err != nil {
return "", fmt.Errorf("failed to initialize state file: %w", err)
}

if err := m.MonitClient.Stop(m.ServiceName); err != nil {
return "", err
}
@@ -8,12 +8,13 @@ import (
"sync"

"code.cloudfoundry.org/lager/v3/lagertest"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/onsi/gomega/ghttp"

"github.com/cloudfoundry-incubator/galera-healthcheck/node_manager"
"github.com/cloudfoundry-incubator/galera-healthcheck/node_manager/node_managerfakes"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

var _ = Describe("NodeManager", func() {
@@ -71,7 +72,7 @@ var _ = Describe("NodeManager", func() {
})

When("monit starts successfully", func() {
When("the service fails during initailization", func() {
When("the service fails during initialization", func() {
BeforeEach(func() {
fakeMonit.StartReturns(nil)
fakeMonit.StatusReturns("failing", nil)
@@ -183,8 +184,8 @@ var _ = Describe("NodeManager", func() {
})
})

When("joining an existing cluter", func() {
When("the service fails during initailization", func() {
When("joining an existing cluster", func() {
When("the service fails during initialization", func() {
BeforeEach(func() {
fakeMonit.StartReturns(nil)
fakeMonit.StatusReturns("failing", nil)
@@ -297,7 +298,7 @@ var _ = Describe("NodeManager", func() {
})

When("monit starts successfully", func() {
When("the service fails during initailization", func() {
When("the service fails during initialization", func() {
BeforeEach(func() {
fakeMonit.StartReturns(nil)
fakeMonit.StatusReturns("failing", nil)
@@ -399,6 +400,18 @@ var _ = Describe("NodeManager", func() {
Expect(err).NotTo(HaveOccurred())
Expect(msg).To(Equal(`stop successful`))
})

It("does not modify the state file", func() {
Expect(os.WriteFile(mgr.StateFilePath, []byte("PRE_EXISTING_CLUSTER_STATE"), 0o0644)).To(Succeed())

_, err := mgr.StopService(nil)
Expect(err).NotTo(HaveOccurred())

contents, err := os.ReadFile(mgr.StateFilePath)
Expect(err).NotTo(HaveOccurred())

Expect(string(contents)).To(Equal("PRE_EXISTING_CLUSTER_STATE"))
})
})
})
