jepsen: add cluster reconfiguration nemesis

This commit is contained in:
Alex Auvolat 2023-10-20 15:48:37 +02:00
parent f5b0972781
commit 654775308e
4 changed files with 106 additions and 14 deletions

View file

@ -27,4 +27,5 @@ Vagrant.configure("2") do |config|
config.vm.define "n3" do |config| vm(config, "n3", "192.168.56.23") end
config.vm.define "n4" do |config| vm(config, "n4", "192.168.56.24") end
config.vm.define "n5" do |config| vm(config, "n5", "192.168.56.25") end
config.vm.define "n6" do |config| vm(config, "n6", "192.168.56.26") end
end

View file

@ -3,3 +3,4 @@
192.168.56.23
192.168.56.24
192.168.56.25
192.168.56.26

View file

@ -10,6 +10,7 @@
[jepsen.os.debian :as debian]
[jepsen.garage
[daemon :as grg]
[nemesis :as grgNemesis]
[reg :as reg]
[set :as set]]))
@ -20,6 +21,11 @@
"set1" set/workload1
"set2" set/workload2})
(def scenari
"A map of scenario names (as accepted by the --scenario command line
option) to scenario constructors. Each constructor is called with the
test options and returns a map holding a :generator and a :nemesis."
;; "cp": clock scramble + partition; "r": cluster reconfiguration
{"cp" grgNemesis/scenario-cp
"r" grgNemesis/scenario-r})
(def patches
"A map of patch names to Garage builds"
{"default" "v0.9.0"
@ -31,6 +37,9 @@
[["-p" "--patch NAME" "Garage patch to use"
:default "default"
:validate [patches (cli/one-of patches)]]
["-s" "--scenario NAME" "Nemesis scenario to run"
:default "cp"
:validate [scenari (cli/one-of scenari)]]
["-r" "--rate HZ" "Approximate number of requests per second, per thread."
:default 10
:parse-fn read-string
@ -40,7 +49,7 @@
:parse-fn parse-long
:validate [pos? "Must be a positive integer."]]
["-w" "--workload NAME" "Workload of test to run"
:default "reg"
:default "reg1"
:validate [workloads (cli/one-of workloads)]]])
(defn garage-test
@ -48,6 +57,7 @@
:concurrency, ...), constructs a test map."
[opts]
(let [workload ((get workloads (:workload opts)) opts)
scenario ((get scenari (:scenario opts)) opts)
garage-version (get patches (:patch opts))]
(merge tests/noop-test
opts
@ -60,25 +70,14 @@
(->>
(:generator workload)
(gen/stagger (/ (:rate opts)))
(gen/nemesis
(cycle [(gen/sleep 5)
{:type :info, :f :partition-start}
(gen/sleep 5)
{:type :info, :f :clock-scramble}
(gen/sleep 5)
{:type :info, :f :partition-stop}
(gen/sleep 5)
{:type :info, :f :clock-scramble}]))
(gen/nemesis (:generator scenario))
(gen/time-limit (:time-limit opts)))
(gen/log "Healing cluster")
(gen/nemesis (gen/once {:type :info, :f :partition-stop}))
(gen/log "Waiting for recovery")
(gen/sleep 10)
(gen/clients (:final-generator workload)))
:nemesis (nemesis/compose
{{:partition-start :start
:partition-stop :stop} (nemesis/partition-random-halves)
{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)})
:nemesis (:nemesis scenario)
:checker (checker/compose
{:perf (checker/perf)
:workload (:checker workload)})

View file

@ -0,0 +1,91 @@
(ns jepsen.garage.nemesis
(:require [clojure.tools.logging :refer :all]
[jepsen [control :as c]
[core :as jepsen]
[generator :as gen]
[nemesis :as nemesis]]
[jepsen.garage.daemon :as grg]
[jepsen.control.util :as cu]))
(defn configure-present!
  "Assign `node` a 1G capacity role in the cluster layout.

  The node's id is obtained by running `node id -q` on the node itself; the
  `layout assign` command is then run on the primary node, using only the
  first 16 characters of that id. This function does not apply the layout;
  reconfigure-subset calls finalize-config! afterwards to do so."
  [test node]
  (info "configure-present!" node)
  (let [primary (jepsen/primary test)
        full-id (c/on node
                      (c/exec grg/binary :node :id :-q))]
    (c/on primary
          (c/exec grg/binary
                  :layout :assign
                  ;; a 16-character prefix of the id is passed to the CLI
                  (subs full-id 0 16)
                  :-c :1G))))
(defn configure-absent!
  "Take `node` out of the active set in the cluster layout by re-assigning
  it with the `-g` flag instead of a capacity.

  The node's id is obtained by running `node id -q` on the node itself; the
  `layout assign` command is then run on the primary node, using only the
  first 16 characters of that id. This function does not apply the layout;
  reconfigure-subset calls finalize-config! afterwards to do so.

  NOTE(review): in Garage's CLI `-g` presumably marks a gateway-only node
  (no storage capacity) -- confirm against the Garage version under test."
  [test node]
  (info "configure-absent!" node)
  (let [primary (jepsen/primary test)
        full-id (c/on node
                      (c/exec grg/binary :node :id :-q))]
    (c/on primary
          (c/exec grg/binary
                  :layout :assign
                  ;; a 16-character prefix of the id is passed to the CLI
                  (subs full-id 0 16)
                  :-g))))
(defn finalize-config!
  "Apply the proposed cluster layout, if there is one.

  Runs `layout show` on the primary node, extracts the version number from
  the `apply --version N` hint in its output, and runs
  `layout apply --version N` on the primary node.

  When the output contains no such hint, a warning is logged and nothing is
  applied, instead of invoking `layout apply --version` with a nil version.
  Returns the result of the apply command, or nil when nothing was applied."
  [test]
  (let [primary     (jepsen/primary test)
        layout-show (c/on primary (c/exec grg/binary :layout :show))
        ;; No trailing newline is required after the version number, so the
        ;; hint is still found when it is the last line of the captured output.
        [_ layout-next-version] (re-find #"apply --version (\d+)" layout-show)]
    (info "layout show: " layout-show "; next-version: " layout-next-version)
    (if layout-next-version
      (c/on primary
            (c/exec grg/binary :layout :apply :--version layout-next-version))
      (warn "no `apply --version N` hint in layout show output; nothing to apply"))))
(defn reconfigure-subset
  "Build a nemesis that reconfigures the cluster layout so that only a random
  subset of `cnt` nodes is configured as present.

  Handled operations (by :f):
    :start  shuffle (:nodes test), keep the first `cnt` nodes as present,
            configure the remaining ones as absent, apply the layout, and
            return the op with :value set to the kept nodes.
    :stop   configure every node in (:nodes test) as present, apply the
            layout, and return the op with :value set to all nodes.

  Any other :f makes `case` throw, since no default clause is provided.
  setup! and teardown! do nothing and return the nemesis itself."
  [cnt]
  (reify nemesis/Nemesis
    (setup! [this test] this)

    (invoke! [this test op]
      (case (:f op)
        :start
        (let [[keep-nodes remove-nodes]
              (->> (:nodes test)
                   shuffle
                   (split-at cnt))]
          (info "layout split: keep " keep-nodes ", remove " remove-nodes)
          ;; stage all role changes first, then apply them in one layout version
          (run! #(configure-present! test %) keep-nodes)
          (run! #(configure-absent! test %) remove-nodes)
          (finalize-config! test)
          (assoc op :value keep-nodes))

        :stop
        (do
          (info "layout un-split: all nodes=" (:nodes test))
          (run! #(configure-present! test %) (:nodes test))
          (finalize-config! test)
          (assoc op :value (:nodes test)))))

    (teardown! [this test] this)))
(defn scenario-cp
  "Clock scramble + partition scenario.

  Returns a map with:
    :generator  an endless cycle of: sleep 5, :partition-start, sleep 5,
                :clock-scramble, sleep 5, :partition-stop, sleep 5,
                :clock-scramble
    :nemesis    a composed nemesis routing :partition-start/:partition-stop
                to a random-halves partitioner (as :start/:stop) and
                :clock-scramble to a clock scrambler (as :scramble)

  `opts` is accepted so that all scenarios share the same signature; it is
  not used here."
  [opts]
  (let [info-op     (fn [f] {:type :info, :f f})
        schedule    [(gen/sleep 5)
                     (info-op :partition-start)
                     (gen/sleep 5)
                     (info-op :clock-scramble)
                     (gen/sleep 5)
                     (info-op :partition-stop)
                     (gen/sleep 5)
                     (info-op :clock-scramble)]
        partitioner (nemesis/partition-random-halves)
        scrambler   (nemesis/clock-scrambler 20.0)]
    {:generator (cycle schedule)
     :nemesis   (nemesis/compose
                 {{:partition-start :start
                   :partition-stop  :stop} partitioner
                  {:clock-scramble :scramble} scrambler})}))
(defn scenario-r
  "Cluster reconfiguration scenario.

  Returns a map with:
    :generator  an endless cycle of: sleep 5, :reconfigure-start, sleep 5,
                :reconfigure-start, sleep 5, :reconfigure-stop
    :nemesis    a composed nemesis routing :reconfigure-start/:reconfigure-stop
                (as :start/:stop) to a reconfigure-subset nemesis that keeps
                3 nodes

  `opts` is accepted so that all scenarios share the same signature; it is
  not used here."
  [opts]
  (let [info-op      (fn [f] {:type :info, :f f})
        ;; two consecutive starts: each one picks a fresh random subset
        schedule     [(gen/sleep 5)
                      (info-op :reconfigure-start)
                      (gen/sleep 5)
                      (info-op :reconfigure-start)
                      (gen/sleep 5)
                      (info-op :reconfigure-stop)]
        reconfigurer (reconfigure-subset 3)]
    {:generator (cycle schedule)
     :nemesis   (nemesis/compose
                 {{:reconfigure-start :start
                   :reconfigure-stop  :stop} reconfigurer})}))