mirror of
https://git.deuxfleurs.fr/Deuxfleurs/garage.git
synced 2024-11-25 01:21:01 +00:00
Talk 2023-01-18: some WIP talking about consensus
This commit is contained in:
parent
7416ba97ef
commit
fe850f62c9
6 changed files with 315 additions and 21 deletions
|
@ -3,6 +3,7 @@ ASSETS=assets/consistent_hashing_1.pdf \
|
|||
assets/consistent_hashing_3.pdf \
|
||||
assets/consistent_hashing_4.pdf \
|
||||
assets/garage_tables.pdf \
|
||||
assets/consensus.pdf_tex \
|
||||
assets/deuxfleurs.pdf
|
||||
|
||||
talk.pdf: talk.tex $(ASSETS)
|
||||
|
@ -10,3 +11,6 @@ talk.pdf: talk.tex $(ASSETS)
|
|||
|
||||
assets/%.pdf: assets/%.svg
|
||||
inkscape -D -z --file=$^ --export-pdf=$@
|
||||
|
||||
assets/%.pdf_tex: assets/%.svg
|
||||
inkscape -D -z --file=$^ --export-pdf=$@ --export-latex
|
||||
|
|
137
doc/talks/2023-01-18-tocatta/assets/consensus.svg
Normal file
137
doc/talks/2023-01-18-tocatta/assets/consensus.svg
Normal file
|
@ -0,0 +1,137 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
width="800"
|
||||
height="300"
|
||||
viewBox="0 0 211.66666 79.374999"
|
||||
version="1.1"
|
||||
id="svg5"
|
||||
inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
|
||||
sodipodi:docname="consensus.svg"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:svg="http://www.w3.org/2000/svg">
|
||||
<sodipodi:namedview
|
||||
id="namedview7"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:showpageshadow="2"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pagecheckerboard="0"
|
||||
inkscape:deskcolor="#d1d1d1"
|
||||
inkscape:document-units="mm"
|
||||
showgrid="false"
|
||||
inkscape:zoom="1.4734708"
|
||||
inkscape:cx="310.49139"
|
||||
inkscape:cy="179.1688"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="999"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="0"
|
||||
inkscape:window-maximized="1"
|
||||
inkscape:current-layer="layer1" />
|
||||
<defs
|
||||
id="defs2">
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto-start-reverse"
|
||||
inkscape:stockid="Arrow2"
|
||||
markerWidth="7.6999998"
|
||||
markerHeight="5.5999999"
|
||||
viewBox="0 0 7.7 5.6"
|
||||
inkscape:isstock="true"
|
||||
inkscape:collect="always"
|
||||
preserveAspectRatio="xMidYMid">
|
||||
<path
|
||||
transform="scale(0.7)"
|
||||
d="M -2,-4 9,0 -2,4 c 2,-2.33 2,-5.66 0,-8 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:none"
|
||||
id="arrow2L" />
|
||||
</marker>
|
||||
</defs>
|
||||
<g
|
||||
inkscape:label="Layer 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1">
|
||||
<g
|
||||
id="g1218"
|
||||
transform="translate(-8.9161476,-12.502301)">
|
||||
<circle
|
||||
style="fill:#ffffff;stroke:#000000;stroke-width:1;stroke-dasharray:none;stroke-opacity:1"
|
||||
id="path111"
|
||||
cx="38.904896"
|
||||
cy="37.936272"
|
||||
r="13.474442" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||
x="38.879501"
|
||||
y="40.908073"
|
||||
id="text1105"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan1103"
|
||||
style="stroke-width:0.264583;fill:#000000"
|
||||
x="38.879501"
|
||||
y="40.908073">$\bot$</tspan></text>
|
||||
</g>
|
||||
<g
|
||||
id="g1218-3"
|
||||
transform="translate(127.41938,-12.502301)">
|
||||
<circle
|
||||
style="fill:#ffffff;stroke:#000000;stroke-width:1;stroke-dasharray:none;stroke-opacity:1"
|
||||
id="path111-5"
|
||||
cx="38.904896"
|
||||
cy="37.936272"
|
||||
r="13.474442" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||
x="38.879501"
|
||||
y="40.908073"
|
||||
id="text1105-6"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan1103-2"
|
||||
style="stroke-width:0.264583;fill:#000000"
|
||||
x="38.879501"
|
||||
y="40.908073">$x$</tspan></text>
|
||||
</g>
|
||||
<path
|
||||
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#Arrow2)"
|
||||
d="M 44.289635,25.433971 H 145.90576"
|
||||
id="path1414"
|
||||
sodipodi:nodetypes="cc" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||
x="92.729836"
|
||||
y="21.781803"
|
||||
id="text2092"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan2090"
|
||||
style="stroke-width:0.264583;fill:#000000"
|
||||
x="92.729836"
|
||||
y="21.781803">$propose(x) / x$</tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-size:8.46667px;line-height:1.25;font-family:sans-serif;text-align:center;text-anchor:middle;stroke-width:0.264583;fill:#000000"
|
||||
x="166.29887"
|
||||
y="69.89299"
|
||||
id="text2092-9"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan2090-1"
|
||||
style="stroke-width:0.264583;fill:#000000"
|
||||
x="166.29887"
|
||||
y="69.89299">$propose(y) / x$</tspan></text>
|
||||
<path
|
||||
style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-dasharray:none;stroke-opacity:1;marker-end:url(#Arrow2)"
|
||||
d="m 155.82329,35.899857 c -8.35129,12.319651 0.54055,24.640898 11.72797,24.072085 8.65403,-0.440005 18.59818,-11.705963 11.8146,-20.570891"
|
||||
id="path2150"
|
||||
sodipodi:nodetypes="csc" />
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 4.7 KiB |
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 13 KiB |
Binary file not shown.
|
@ -8,6 +8,7 @@
|
|||
\usepackage{multirow}
|
||||
\usetheme{boxes}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{import}
|
||||
\usepackage{adjustbox}
|
||||
%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
|
||||
%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
|
||||
|
@ -479,33 +480,185 @@
|
|||
|
||||
\section{Problem 2: ensuring consistency}
|
||||
|
||||
%\begin{frame}
|
||||
% \frametitle{Garage's architecture}
|
||||
% \begin{center}
|
||||
% \includegraphics[width=.35\linewidth]{assets/garage.drawio.pdf}
|
||||
% \end{center}
|
||||
%\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Garage is \emph{coordination-free}:}
|
||||
\frametitle{Consensus vs weak consistency}
|
||||
|
||||
\hspace{1em}
|
||||
\begin{minipage}{7cm}
|
||||
\textbf{Consensus-based systems:}
|
||||
\vspace{1em}
|
||||
\begin{itemize}
|
||||
\item No Raft or Paxos
|
||||
\item \textbf{Leader-based:} a leader is elected to coordinate
|
||||
all reads and writes
|
||||
\vspace{1em}
|
||||
\item Internal data types are CRDTs
|
||||
\item \textbf{Linearizability} of all operations\\
|
||||
(strongest consistency guarantee)
|
||||
\vspace{1em}
|
||||
\item All nodes are equivalent (no master/leader/index node)
|
||||
\item \textbf{Replicated state machines} that can implement
|
||||
any sequential specification
|
||||
\vspace{1em}
|
||||
\item \textbf{Costly}, the leader is a bottleneck;
|
||||
leader elections on failure take time
|
||||
\end{itemize}
|
||||
\vspace{2em}
|
||||
$\to$ less sensitive to higher latencies between nodes
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{7cm} \visible<2->{
|
||||
\textbf{Weakly consistent systems:}
|
||||
\vspace{1em}
|
||||
\begin{itemize}
|
||||
\item \textbf{Nodes are equivalent}, any node
|
||||
can originate a read or write operation
|
||||
\vspace{1em}
|
||||
\item \textbf{Read-after-write consistency} with quorums,
|
||||
eventual consistency without
|
||||
\vspace{1em}
|
||||
\item \textbf{Operations have to commute}, i.e.~we
|
||||
can only implement CRDTs
|
||||
\vspace{1em}
|
||||
\item \textbf{Fast}, no node is a bottleneck;\\
|
||||
works the same with offline nodes
|
||||
\end{itemize}
|
||||
} \end{minipage}
|
||||
\hspace{1em}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Consistency model}
|
||||
\begin{itemize}
|
||||
\item Not ACID (not required by S3 spec) / not linearizable
|
||||
\frametitle{Consensus vs weak consistency}
|
||||
\begin{center}
|
||||
\textbf{The same objects cannot be implemented in both models.}
|
||||
\end{center}
|
||||
\vspace{2em}
|
||||
|
||||
\hspace{1em}
|
||||
\begin{minipage}{7cm}
|
||||
\underline{Consensus-based systems:}
|
||||
|
||||
\vspace{1em}
|
||||
\item \textbf{Read-after-write consistency}\\
|
||||
{\footnotesize (stronger than eventual consistency)}
|
||||
|
||||
\textbf{Any sequential specification}\\~
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{7cm}
|
||||
\underline{Weakly consistent systems:}
|
||||
|
||||
\vspace{1em}
|
||||
|
||||
\textbf{CRDTs only}\\(conflict-free replicated data types)
|
||||
\end{minipage}
|
||||
\hspace{1em}
|
||||
|
||||
\vspace{3em}
|
||||
\begin{center}
|
||||
Part of the complexity is \textbf{reported to the consumer of the API}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Consensus vs weak consistency}
|
||||
\begin{center}
|
||||
\textbf{From a theoretical point of view:}\\
|
||||
|
||||
\end{center}
|
||||
\vspace{2em}
|
||||
|
||||
\hspace{1em}
|
||||
\begin{minipage}{6.5cm}
|
||||
\underline{Consensus-based systems:}
|
||||
|
||||
\vspace{1em}
|
||||
|
||||
Require \textbf{additionnal assumptions} such as a fault detector or a strong RNG\\~
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{6.5cm}
|
||||
\underline{Weakly consistent systems:}
|
||||
|
||||
\vspace{1em}
|
||||
|
||||
Can be implemented in \textbf{any asynchronous message passing distributed system}
|
||||
\end{minipage}
|
||||
\hspace{1em}
|
||||
|
||||
\vspace{3em}
|
||||
\begin{center}
|
||||
They represent \textbf{different classes of computational capability}
|
||||
\end{center}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Understanding the power of consensus}
|
||||
\textbf{Consensus:} an API with a single operation, $propose(x)$
|
||||
\begin{enumerate}
|
||||
\item nodes all call $propose(x)$ with their proposed value;
|
||||
\item nodes all receive the same value as a return value, which is one of the proposed values
|
||||
\end{enumerate}
|
||||
\vspace{1em}
|
||||
|
||||
\visible<2->{
|
||||
\textbf{Equivalent to} a distributed algorithm that gives a total order on all requests
|
||||
}
|
||||
\vspace{1em}
|
||||
|
||||
\visible<3->{
|
||||
\textbf{Implemented by} this simple replicated state machine:
|
||||
\vspace{.5em}
|
||||
\begin{figure}
|
||||
\centering
|
||||
\def\svgwidth{.5\textwidth}
|
||||
\large
|
||||
\import{assets/}{consensus.pdf_tex}
|
||||
\end{figure}
|
||||
\vspace{1em}
|
||||
}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Can my object be implemented without consensus?}
|
||||
\underline{Given the specification of an API:}
|
||||
\vspace{2em}
|
||||
\begin{itemize}
|
||||
\item \textbf{Using this API, we can implement the consensus object} (the $propose$ function)\\
|
||||
$\to$ the API is equivalent to consensus/total ordering of messages\\
|
||||
$\to$ the API cannot be implemented in a weakly consistent system
|
||||
\vspace{2em}
|
||||
\item \textbf{This API can be implemented using only weak primitives}\\
|
||||
(e.g. a bunch of atomic registers)\\
|
||||
$\to$ the API is strictly weaker than consensus\\
|
||||
$\to$ we can implement it in Garage!
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{Why avoid consensus?}
|
||||
Consensus can be implemented reasonably well in practice, so why avoid it?
|
||||
\vspace{2em}
|
||||
\begin{itemize}
|
||||
\item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\
|
||||
harder to prove, harder to reason about
|
||||
\vspace{1.5em}
|
||||
\item \textbf{Performance issues:}
|
||||
\vspace{1em}
|
||||
\begin{itemize}
|
||||
\item The leader is a \textbf{bottleneck} for all requests
|
||||
\vspace{1em}
|
||||
\item Particularly \textbf{sensitive to higher latency} between nodes
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
\begin{frame}
|
||||
\frametitle{What can we implement without consensus?}
|
||||
\begin{itemize}
|
||||
\item Any \textbf{conflict-free replicated data type} (CRDT)
|
||||
\vspace{1em}
|
||||
\item Non-transactional key-value stores such as S3 are equivalent to a simple CRDT:\\
|
||||
a \textbf{last-writer-wins registry}
|
||||
\vspace{1em}
|
||||
\item \textbf{Read-after-write consistency} can be implemented
|
||||
using quorums on read and write operations
|
||||
\vspace{1em}
|
||||
\item \textbf{Monotonicity of reads} can be implemented with repair-on-read\\
|
||||
(makes reads more costly, not implemented in Garage)
|
||||
\end{itemize}
|
||||
\end{frame}
|
||||
|
||||
|
|
Loading…
Reference in a new issue