mirror of
https://git.deuxfleurs.fr/Deuxfleurs/garage.git
synced 2025-01-24 23:18:16 +00:00
370 lines
10 KiB
TeX
370 lines
10 KiB
TeX
\nonstopmode
|
|
\documentclass[aspectratio=169]{beamer}
|
|
\usepackage[utf8]{inputenc}
|
|
% \usepackage[frenchb]{babel}
|
|
\usepackage{amsmath}
|
|
\usepackage{mathtools}
|
|
\usepackage{breqn}
|
|
\usepackage{multirow}
|
|
\usetheme{boxes}
|
|
\usepackage{graphicx}
|
|
\usepackage{import}
|
|
\usepackage{adjustbox}
|
|
%\useoutertheme[footline=authortitle,subsection=false]{miniframes}
|
|
%\useoutertheme[footline=authorinstitute,subsection=false]{miniframes}
|
|
\useoutertheme{infolines}
|
|
\setbeamertemplate{headline}{}
|
|
|
|
\beamertemplatenavigationsymbolsempty
|
|
|
|
\definecolor{TitleOrange}{RGB}{255,137,0}
|
|
\setbeamercolor{title}{fg=TitleOrange}
|
|
\setbeamercolor{frametitle}{fg=TitleOrange}
|
|
|
|
\definecolor{ListOrange}{RGB}{255,145,5}
|
|
\setbeamertemplate{itemize item}{\color{ListOrange}$\blacktriangleright$}
|
|
|
|
\definecolor{verygrey}{RGB}{70,70,70}
|
|
\setbeamercolor{normal text}{fg=verygrey}
|
|
|
|
|
|
\usepackage{tabu}
|
|
\usepackage{multicol}
|
|
\usepackage{vwcol}
|
|
\usepackage{stmaryrd}
|
|
\usepackage{graphicx}
|
|
|
|
\usepackage[normalem]{ulem}
|
|
|
|
\AtBeginSection[]{
|
|
\begin{frame}
|
|
\vfill
|
|
\centering
|
|
\begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
|
|
\usebeamerfont{title}\insertsectionhead\par%
|
|
\end{beamercolorbox}
|
|
\vfill
|
|
\end{frame}
|
|
}
|
|
|
|
\title{Garage}
|
|
\subtitle{a lightweight and robust geo-distributed data storage system}
|
|
\author{Alex Auvolat, Deuxfleurs}
|
|
\date{SEED webinar, 2024-01-12}
|
|
|
|
\begin{document}
|
|
|
|
% \begin{frame}
|
|
% \centering
|
|
% \includegraphics[width=.3\linewidth]{../../sticker/Garage.png}
|
|
% \vspace{1em}
|
|
%
|
|
% {\large\bf Alex Auvolat, Deuxfleurs Association}
|
|
% \vspace{1em}
|
|
%
|
|
% \url{https://garagehq.deuxfleurs.fr/}
|
|
%
|
|
% %Matrix channel: \texttt{\#garage:deuxfleurs.fr}
|
|
% \end{frame}
|
|
|
|
\begin{frame}
|
|
%\frametitle{Who I am}
|
|
\begin{columns}[t]
|
|
\begin{column}{.2\textwidth}
|
|
\centering
|
|
\adjincludegraphics[width=.4\linewidth, valign=t]{../assets/alex.jpg}
|
|
\end{column}
|
|
\begin{column}{.6\textwidth}
|
|
\textbf{Alex Auvolat}\\
|
|
Member of Deuxfleurs, lead developer of Garage
|
|
\end{column}
|
|
\begin{column}{.2\textwidth}
|
|
~
|
|
\end{column}
|
|
\end{columns}
|
|
\vspace{.5em}
|
|
|
|
\begin{columns}[t]
|
|
\begin{column}{.2\textwidth}
|
|
\centering
|
|
\adjincludegraphics[width=.6\linewidth, valign=t]{../../logo/garage-notext.png}
|
|
\end{column}
|
|
\begin{column}{.6\textwidth}
|
|
\textbf{Garage}\\
|
|
A self-hosted alternative to S3 for object storage
|
|
\end{column}
|
|
\begin{column}{.2\textwidth}
|
|
~
|
|
\end{column}
|
|
\end{columns}
|
|
\vspace{2em}
|
|
|
|
\begin{columns}[t]
|
|
\begin{column}{.2\textwidth}
|
|
\centering
|
|
\adjincludegraphics[width=.5\linewidth, valign=t]{../assets/deuxfleurs.pdf}
|
|
\end{column}
|
|
\begin{column}{.6\textwidth}
|
|
\textbf{Deuxfleurs}\\
|
|
A non-profit self-hosting collective,\\
|
|
member of the CHATONS network
|
|
\end{column}
|
|
\begin{column}{.2\textwidth}
|
|
\centering
|
|
\adjincludegraphics[width=.7\linewidth, valign=t]{../assets/logo_chatons.png}
|
|
\end{column}
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Stable vs Resilient}
|
|
|
|
\hspace{1em}
|
|
\begin{minipage}{7cm}
|
|
\textbf{Building a ``stable'' system:}
|
|
\vspace{1em}
|
|
|
|
Enterprise-grade systems typically employ:
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item RAID
|
|
\item Redundant power grid + UPS
|
|
\item Redundant Internet connections
|
|
\item Low-latency links
|
|
\item \dots
|
|
\end{itemize}
|
|
\vspace{1em}
|
|
$\to$ costly, only worth it at DC scale\\
|
|
$\to$ still a risk of DC-level incidents\dots
|
|
\end{minipage}
|
|
\hfill
|
|
\begin{minipage}{7cm}
|
|
\textbf{Building a \underline{resilient} system:}
|
|
\vspace{1em}
|
|
|
|
An alternative, cheaper way:
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item Commodity hardware \\(e.g.\ old desktop PCs)
|
|
\vspace{.5em}
|
|
\item Commodity Internet \\(e.g.\ FTTB, FTTH) and power grid
|
|
\vspace{.5em}
|
|
\item \textbf{Geographical redundancy} \\(multi-site replication)
|
|
\end{itemize}
|
|
\vspace{1.5em}
|
|
\end{minipage}
|
|
\hspace{1em}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Example: our infrastructure at Deuxfleurs}
|
|
\only<1>{
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{../assets/neptune.jpg}
|
|
\end{center}
|
|
}
|
|
\only<2>{
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{../assets/atuin.jpg}
|
|
\end{center}
|
|
}
|
|
\only<3>{
|
|
\begin{center}
|
|
\includegraphics[width=.8\linewidth]{../assets/inframap_jdll2023.pdf}
|
|
\end{center}
|
|
}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Object storage: simpler than file systems}
|
|
|
|
\begin{minipage}{6cm}
|
|
Only two operations:
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item Put an object at a key
|
|
\vspace{1em}
|
|
\item Retrieve an object from its key
|
|
\end{itemize}
|
|
\vspace{1em}
|
|
{\footnotesize (and a few others)}
|
|
|
|
\vspace{1em}
|
|
Sufficient for many applications!
|
|
\end{minipage}
|
|
\hfill
|
|
\begin{minipage}{8cm}
|
|
\begin{center}
|
|
\vspace{2em}
|
|
\includegraphics[height=6em]{../2020-12-02_wide-team/img/Amazon-S3.jpg}
|
|
\hspace{2em}
|
|
\includegraphics[height=5em]{../assets/minio.png}
|
|
|
|
\vspace{2em}
|
|
\includegraphics[height=6em]{../../logo/garage_hires_crop.png}
|
|
\end{center}
|
|
\vspace{1em}
|
|
\end{minipage}
|
|
\end{frame}
|
|
|
|
|
|
\begin{frame}
|
|
\frametitle{The data model of object storage}
|
|
Object storage is basically a key-value store:
|
|
\vspace{1em}
|
|
|
|
\begin{center}
|
|
\begin{tabular}{|l|p{8cm}|}
|
|
\hline
|
|
\textbf{Key: file path + name} & \textbf{Value: file data + metadata} \\
|
|
\hline
|
|
\hline
|
|
\texttt{index.html} &
|
|
\texttt{Content-Type: text/html; charset=utf-8} \newline
|
|
\texttt{Content-Length: 24929} \newline
|
|
\texttt{<binary blob>} \\
|
|
\hline
|
|
\texttt{img/logo.svg} &
|
|
\texttt{Content-Type: image/svg+xml} \newline
|
|
\texttt{Content-Length: 13429} \newline
|
|
\texttt{<binary blob>} \\
|
|
\hline
|
|
\texttt{download/index.html} &
|
|
\texttt{Content-Type: text/html; charset=utf-8} \newline
|
|
\texttt{Content-Length: 26563} \newline
|
|
\texttt{<binary blob>} \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Implementation: consensus vs weak consistency}
|
|
|
|
\hspace{1em}
|
|
\begin{minipage}{7cm}
|
|
\textbf{Consensus-based systems:}
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item \textbf{Leader-based:} a leader is elected to coordinate
|
|
all reads and writes
|
|
\vspace{1em}
|
|
\item Allows for \textbf{sequential reasoning}:
|
|
program as if running on a single machine
|
|
\vspace{1em}
|
|
\item Serializability is one of the \\
|
|
\textbf{strongest consistency guarantees}
|
|
\vspace{1em}
|
|
\item \textbf{Costly}, the leader is a bottleneck;
|
|
leader elections on failure take time
|
|
\end{itemize}
|
|
\end{minipage}
|
|
\hfill
|
|
\begin{minipage}{7cm} \visible<2->{
|
|
\textbf{Weakly consistent systems:}
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item \textbf{Nodes are equivalent}, any node
|
|
can originate a read or write operation
|
|
\vspace{1em}
|
|
\item \textbf{Operations must be independent},
|
|
conflicts are resolved after the fact
|
|
\vspace{1em}
|
|
\item Strongest achievable consistency:\\
|
|
\textbf{read-after-write consistency}\\(using quorums)
|
|
\vspace{1em}
|
|
\item \textbf{Fast}, no single bottleneck;\\
|
|
works transparently with offline nodes
|
|
\end{itemize}
|
|
} \end{minipage}
|
|
\hspace{1em}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Why avoid consensus?}
|
|
Consensus can be implemented reasonably well in practice, so why avoid it?
|
|
\vspace{2em}
|
|
\begin{itemize}
|
|
\item \textbf{Software complexity:} RAFT and PAXOS are complex beasts;\\
|
|
harder to prove, harder to reason about
|
|
\vspace{1.5em}
|
|
\item \textbf{Performance issues:}
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item Taking a decision may take an \textbf{arbitrary number of steps} (in adverse scenarios)
|
|
\vspace{1em}
|
|
\item The leader is a \textbf{bottleneck} for all requests;\\
|
|
even in leaderless approaches, \textbf{all nodes must process all operations in order}
|
|
\vspace{1em}
|
|
\item Particularly \textbf{sensitive to higher latency} between nodes
|
|
\end{itemize}
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{Objective: the right level of consistency for Garage}
|
|
|
|
\underline{Constraints:} slow network (geographical distance), node unavailability/crashes\\
|
|
\underline{Objective:} maximize availability, maintain an \emph{appropriate level of consistency}\\
|
|
\vspace{1em}
|
|
\begin{enumerate}
|
|
\item<2-> \textbf{Weak consistency for most things}\\
|
|
\vspace{1em}
|
|
\underline{Example:} \texttt{PutObject}\\
|
|
\vspace{.5em}
|
|
If two clients write the same
|
|
object at the same time, one of the two is implicitly overwritten.
|
|
No need to coordinate, use a \emph{last-writer-wins register}.
|
|
\vspace{1em}
|
|
\item<3-> \textbf{Stronger consistency only when necessary}\\
|
|
\vspace{1em}
|
|
\underline{Example:} \texttt{CreateBucket}\\
|
|
\vspace{.5em}
|
|
A bucket is a reserved name in a shared namespace;
|
|
two clients should be prevented from both creating the same bucket
|
|
(\emph{mutual exclusion}).
|
|
\end{enumerate}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\frametitle{The possibility of \emph{leaderless consensus}}
|
|
Currently, Garage \emph{only has weak consistency}. It is fast, but \texttt{CreateBucket} is broken!
|
|
|
|
\visible<2->{
|
|
\vspace{1em}
|
|
Leaderless consensus (Antoniadis et al., 2023) alleviates issues with RAFT and PAXOS:
|
|
\vspace{1em}
|
|
\begin{itemize}
|
|
\item \textbf{No leader.} All nodes participate equally at each time step,
|
|
and different nodes can be unavailable at different times without issues.
|
|
\\ \vspace{.5em} $\to$ better tolerance to high latency (removes the bottleneck issue)
|
|
\\ $\to$ tolerates crashes transparently
|
|
\vspace{1em}
|
|
\item \textbf{Simpler formalization.} The algorithm is very simple to express and to analyze in mathematical terms.
|
|
\end{itemize}
|
|
}
|
|
\visible<3->{
|
|
\vspace{1em}
|
|
One of the possible subjects for this PhD:
|
|
\\$\to$ \emph{integration of leaderless consensus in Garage} + testing + perf eval, etc.
|
|
}
|
|
\end{frame}
|
|
|
|
\begin{frame}
|
|
\begin{center}
|
|
\includegraphics[width=.25\linewidth]{../../logo/garage_hires.png}\\
|
|
\vspace{-1em}
|
|
\url{https://garagehq.deuxfleurs.fr/}\\
|
|
\url{mailto:garagehq@deuxfleurs.fr}\\
|
|
\texttt{\#garage:deuxfleurs.fr} on Matrix
|
|
|
|
\vspace{1.5em}
|
|
\includegraphics[width=.06\linewidth]{../assets/rust_logo.png}
|
|
\includegraphics[width=.13\linewidth]{../assets/AGPLv3_Logo.png}
|
|
\end{center}
|
|
\end{frame}
|
|
|
|
\end{document}
|
|
|
|
%% vim: set ts=4 sw=4 tw=0 noet spelllang=en :
|