import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def read_metrics(ident, path):
    """Yield ``(ident, metric, value)`` tuples parsed from a metrics file.

    Each non-blank line must look like ``metric:value`` or
    ``timestamp:metric:value``. The timestamp, when present, is discarded.

    Args:
        ident: Label attached to every yielded tuple.
        path: Path of the file to read.

    Yields:
        ``(ident, metric, float(value))`` for each data line, in file order.

    Raises:
        ValueError: If a non-blank line does not split into two or three
            colon-separated fields, or if its value is not a valid float.
    """
    with open(path, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline at the end of the
                # file) holds no sample; skip it instead of aborting.
                continue
            pieces = stripped.split(':')
            if len(pieces) == 2:
                metric, value = pieces
            elif len(pieces) == 3:
                _timestamp, metric, value = pieces
            else:
                raise ValueError("Couldn't parse line: %s" % (line,))
            yield ident, metric, float(value)
def read_all_metrics(paths):
    """Load several metrics files into one long-format DataFrame.

    Args:
        paths: Iterable of ``(ident, path)`` pairs. Every sample read from
            ``path`` is labelled with ``ident``.

    Returns:
        A DataFrame with columns ``treatment``, ``metric`` and ``value``,
        one row per sample. When nothing is read the frame is empty but
        still has those three columns.
    """
    def inner():
        for ident, path in paths:
            yield from read_metrics(ident, path)

    # Naming the columns in the constructor (instead of assigning them
    # afterwards) keeps this working when no rows are produced at all.
    return pd.DataFrame(inner(), columns=['treatment', 'metric', 'value'])
def reject_outliers(df_in, k=3):
    """Drop samples lying more than ``k`` standard deviations from the mean.

    Mean and standard deviation are computed separately for every
    ``(treatment, metric)`` group.

    Args:
        df_in: DataFrame with ``treatment``, ``metric`` and ``value`` columns.
        k: Number of allowed standard deviations away from the group mean.

    Returns:
        The rows of ``df_in`` that are not outliers, restricted to the
        ``treatment``, ``metric`` and ``value`` columns. Original index
        labels and row order are preserved.
    """
    grouped = df_in.groupby(['treatment', 'metric'])['value']
    # String aggregation names avoid the deprecated NumPy-callable path.
    mean = grouped.transform('mean')
    # A single-sample group has an undefined (NaN) deviation; treat it like
    # a group with no spread.
    std = grouped.transform('std').fillna(0)

    within = (df_in['value'] - mean).abs() <= k * std
    # A group with no spread cannot contain outliers. Without this guard
    # every one of its rows would be rejected.
    keep = (std == 0) | within
    return df_in.loc[keep, ['treatment', 'metric', 'value']]
def report(path_format, n_indices):
    """Plot the mean value of each metric against the index count.

    Args:
        path_format: ``str.format`` template that is given an index count
            and produces the path of the metrics file for that run.
        n_indices: Iterable of index counts. Each count is used both as the
            treatment label (the plot's x axis) and as the value substituted
            into ``path_format``.
    """
    df = read_all_metrics(
        (index_count, path_format.format(index_count))
        for index_count in n_indices)
    df_agg = (df.groupby(['metric', 'treatment']).mean().reset_index()
              .pivot(index='treatment', columns='metric', values='value'))
    # Data collection was bad and included fetch_cluster_state in move_shard collection
    df_agg['move_shard'] -= df_agg['fetch_cluster_state']
    # Not meaningful like this. Tolerate runs that recorded no 'dupe' metric
    # instead of failing with a KeyError.
    df_agg = df_agg.drop(columns=['dupe'], errors='ignore')
    ax = df_agg.plot(xlim=(0, 2800), ylim=(0, 8))
    ax.set_ylabel('seconds')
def compare(*paths, max_std=None):
    """Build a table of summary statistics for several metrics files.

    Args:
        *paths: Metrics file paths. Each path is also used as the treatment
            label for the samples read from it.
        max_std: When not ``None``, samples are first passed through
            ``reject_outliers`` with this many allowed standard deviations.

    Returns:
        A DataFrame indexed by ``(metric, aggregation)`` with one column per
        treatment, holding the min, mean, max and std of each metric. The
        ``dupe`` metric is excluded.
    """
    samples = read_all_metrics((p, p) for p in paths)
    if max_std is not None:
        samples = reject_outliers(samples, max_std)
    samples = samples[samples['metric'] != 'dupe']

    stats = samples.groupby(['treatment', 'metric']).aggregate(
        [np.min, np.mean, np.max, np.std])
    # Move the metrics into the columns and flip the frame so treatments
    # become the columns.
    table = stats.unstack('metric').transpose()
    # The outermost index level is the constant 'value' column name.
    table = table.reset_index(level=0, drop=True)
    table = table.rename_axis(['aggregation', 'metric'])
    # Re-key the rows as (metric, aggregation), sorted in that order.
    table = table.reset_index().sort_values(['metric', 'aggregation'])
    return table.set_index(['metric', 'aggregation'])
def detail_report(path):
    """Draw a histogram and an over-time plot for each metric in one file.

    The figure has four columns, so two metrics share each row: a histogram
    followed by the raw samples in the order they were read.

    Args:
        path: Path of the metrics file. Also used as the figure title.
    """
    df = read_all_metrics([(path, path)])
    metrics = list(df['metric'].unique())
    if 'create_archive' in metrics:
        # Move create_archive to the end so it is plotted last.
        metrics.remove('create_archive')
        metrics.append('create_archive')
    height = int(math.ceil(len(metrics) / 2.0))
    # squeeze=False keeps ``axes`` two-dimensional even when there is only
    # one row; otherwise ``axes[x][y]`` fails for one or two metrics.
    fig, axes = plt.subplots(height, 4, figsize=(16, 3 * height),
                             squeeze=False)
    fig.suptitle(path)
    for i, metric in enumerate(metrics):
        row, pair = divmod(i, 2)
        col = 2 * pair  # each metric occupies two adjacent columns
        df_metric = df[df['metric'] == metric].reset_index(drop=True)
        hist_ax = axes[row][col]
        hist_ax.set_title('{} hist'.format(metric))
        df_metric.value.plot.hist(ax=hist_ax)
        time_ax = axes[row][col + 1]
        time_ax.set_title('{} over time'.format(metric))
        time_ax.set_ylabel('seconds')
        df_metric.value.plot(ax=time_ax)
    plt.tight_layout()
    # Leave room at the top for the suptitle.
    plt.subplots_adjust(top=0.90)
These first two graphs are from a 4 node cluster running on a laptop to check initial feasibility and develop measurement software. The y axis is the mean number of seconds to complete an operation. The x axis is the number of non-archive indices that exist. The first graph has, in addition to the specified number of indices, an expected (1/3 of index count) number of additional archive indices, whose impact we are measuring.
# One latency graph per data set: the archive runs first, then the plain runs.
for path_format, index_counts in (
        ('latencies-archive-{}', (10, 30, 90, 270, 810, 1620, 2798)),
        ('latencies-{}', (10, 30, 90, 270, 810, 1620))):
    report(path_format, index_counts)
Compare latencies of various operations on the clusters under different treatments. The codfw tests were run from wasat; the eqiad tests were run from terbium. Minor changes in the data collection were made between runs. All values reported here are in seconds.
Race conditions (and sub-par data collection) mean that sometimes actions that cause the cluster to go green->yellow, such as adding a replica, are recorded into the next action that waits for green.
from IPython.display import display
# Earlier report names; not used elsewhere in this section.
old_reports = (
    'codfw-with-archive', 'codfw-default', 'eqiad-default',
    'eqiad-with-archive', 'eqiad-with-archive-again', 'eqiad-with-2x-archive',
)

datacenters = ('eqiad', 'codfw')
clusters = ('psi', 'chi', 'omega')
suffixes = ('', '-with-archive')

# Every cluster/datacenter combination, without and with the archive suffix.
reports = ['{}-{}{}'.format(cluster, dc, suffix)
           for dc in datacenters
           for cluster in clusters
           for suffix in suffixes]

# One comparison table per datacenter and cluster: plain run vs. archive run.
for dc in datacenters:
    for cluster in clusters:
        pair = ['{}-{}{}'.format(cluster, dc, suffix) for suffix in suffixes]
        display(compare(*pair))
treatment | psi-eqiad | psi-eqiad-with-archive | |
---|---|---|---|
metric | aggregation | ||
add_replica | amax | 1.445567 | 2.459350 |
amin | 0.284007 | 0.412462 | |
mean | 0.453486 | 0.845574 | |
std | 0.173732 | 0.442530 | |
create_archive | amax | NaN | 9.733179 |
amin | NaN | 0.676889 | |
mean | NaN | 1.181620 | |
std | NaN | 0.980406 | |
create_index | amax | 1.748412 | 36.475646 |
amin | 0.798888 | 0.981279 | |
mean | 1.055407 | 2.470375 | |
std | 0.214126 | 4.307091 | |
delete_index | amax | 2.353573 | 2.818253 |
amin | 0.260263 | 0.365700 | |
mean | 0.541047 | 0.988521 | |
std | 0.350684 | 0.583953 | |
fetch_cluster_state | amax | 4.362320 | 4.819047 |
amin | 2.300280 | 2.918922 | |
mean | 3.135959 | 3.785449 | |
std | 0.366079 | 0.423175 | |
move_shard | amax | 1.651968 | 4.231689 |
amin | 0.378666 | 0.935276 | |
mean | 0.939027 | 1.488404 | |
std | 0.185134 | 0.476390 | |
remove_replica | amax | 0.686313 | 2.077054 |
amin | 0.266160 | 0.363733 | |
mean | 0.376752 | 0.733203 | |
std | 0.094806 | 0.341195 |
treatment | chi-eqiad | chi-eqiad-with-archive | |
---|---|---|---|
metric | aggregation | ||
add_replica | amax | 3.952081 | 46.336351 |
amin | 0.485043 | 0.409097 | |
mean | 1.009359 | 1.284418 | |
std | 0.469344 | 4.626562 | |
create_archive | amax | NaN | 17.250039 |
amin | NaN | 0.314673 | |
mean | NaN | 1.090191 | |
std | NaN | 1.723468 | |
create_index | amax | 63.110431 | 194.680247 |
amin | 0.980831 | 0.846730 | |
mean | 2.711586 | 3.913923 | |
std | 6.131714 | 19.300513 | |
delete_index | amax | 6.682615 | 6.113952 |
amin | 0.253268 | 0.310206 | |
mean | 1.444515 | 1.083327 | |
std | 1.049900 | 1.066920 | |
fetch_cluster_state | amax | 2.722839 | 4.176132 |
amin | 1.641279 | 2.088655 | |
mean | 2.127756 | 2.707942 | |
std | 0.268448 | 0.350443 | |
move_shard | amax | 4.061344 | 6.289596 |
amin | 0.781141 | 0.569589 | |
mean | 1.346381 | 1.280889 | |
std | 0.488110 | 0.676869 | |
remove_replica | amax | 9.191319 | 5.000494 |
amin | 0.240559 | 0.228756 | |
mean | 0.984504 | 0.703113 | |
std | 1.110422 | 0.739870 |
treatment | omega-eqiad | omega-eqiad-with-archive | |
---|---|---|---|
metric | aggregation | ||
add_replica | amax | 1.330829 | 3.313551 |
amin | 0.389913 | 0.454293 | |
mean | 0.559073 | 0.817754 | |
std | 0.207199 | 0.524519 | |
create_archive | amax | NaN | 15.464149 |
amin | NaN | 0.600245 | |
mean | NaN | 1.165978 | |
std | NaN | 1.033703 | |
create_index | amax | 2.403932 | 291.071260 |
amin | 0.742246 | 1.021956 | |
mean | 1.265074 | 4.681635 | |
std | 0.413879 | 28.974827 | |
delete_index | amax | 1.263980 | 6.491376 |
amin | 0.329946 | 0.446455 | |
mean | 0.505980 | 0.882809 | |
std | 0.207916 | 0.816100 | |
fetch_cluster_state | amax | 4.089278 | 4.843013 |
amin | 2.650902 | 3.095127 | |
mean | 3.203850 | 3.996883 | |
std | 0.377167 | 0.454029 | |
move_shard | amax | 1.241029 | 1.711373 |
amin | 0.517315 | 1.323078 | |
mean | 1.067055 | 1.483154 | |
std | 0.121189 | 0.124994 | |
remove_replica | amax | 1.596538 | 4.506533 |
amin | 0.331686 | 0.438639 | |
mean | 0.463270 | 0.720094 | |
std | 0.196985 | 0.642949 |
treatment | psi-codfw | psi-codfw-with-archive | |
---|---|---|---|
metric | aggregation | ||
add_replica | amax | 1.545777 | 1.474143 |
amin | 0.511079 | 0.553371 | |
mean | 1.022711 | 1.164675 | |
std | 0.274513 | 0.263898 | |
create_archive | amax | NaN | 1.514391 |
amin | NaN | 0.440393 | |
mean | NaN | 0.648939 | |
std | NaN | 0.113343 | |
create_index | amax | 1.815361 | 1.858168 |
amin | 0.836244 | 0.924959 | |
mean | 1.266745 | 1.346399 | |
std | 0.185950 | 0.164690 | |
delete_index | amax | 0.747143 | 0.625578 |
amin | 0.320923 | 0.351336 | |
mean | 0.401220 | 0.388818 | |
std | 0.076002 | 0.037694 | |
fetch_cluster_state | amax | 4.695016 | 4.094146 |
amin | 2.377671 | 2.774941 | |
mean | 3.265336 | 3.361268 | |
std | 0.573914 | 0.306230 | |
move_shard | amax | 1.259534 | 1.374253 |
amin | 0.452415 | 0.530548 | |
mean | 0.925975 | 0.994980 | |
std | 0.207508 | 0.183639 | |
remove_replica | amax | 0.553617 | 0.693972 |
amin | 0.307576 | 0.348142 | |
mean | 0.359248 | 0.376709 | |
std | 0.030037 | 0.045333 |
treatment | chi-codfw | chi-codfw-with-archive | |
---|---|---|---|
metric | aggregation | ||
add_replica | amax | 1.848815 | 1.293268 |
amin | 0.482686 | 0.524209 | |
mean | 0.840587 | 0.791842 | |
std | 0.245589 | 0.116137 | |
create_archive | amax | NaN | 11.423560 |
amin | NaN | 0.404857 | |
mean | NaN | 0.707950 | |
std | NaN | 0.779678 | |
create_index | amax | 14.293614 | 2.857695 |
amin | 0.767582 | 0.777848 | |
mean | 1.146992 | 0.930634 | |
std | 1.353164 | 0.304025 | |
delete_index | amax | 1.049806 | 0.710945 |
amin | 0.305940 | 0.326263 | |
mean | 0.392095 | 0.387373 | |
std | 0.109561 | 0.068775 | |
fetch_cluster_state | amax | 3.319163 | 2.898446 |
amin | 1.762060 | 1.970118 | |
mean | 2.216643 | 2.370863 | |
std | 0.416274 | 0.223110 | |
move_shard | amax | 3.261147 | 1.028186 |
amin | 0.559117 | 0.497986 | |
mean | 0.821998 | 0.824132 | |
std | 0.315121 | 0.118986 | |
remove_replica | amax | 1.466925 | 1.405399 |
amin | 0.299062 | 0.316189 | |
mean | 0.377101 | 0.373123 | |
std | 0.125765 | 0.131555 |
treatment | omega-codfw | omega-codfw-with-archive | |
---|---|---|---|
metric | aggregation | ||
add_replica | amax | 1.782622 | 1.528175 |
amin | 0.494155 | 0.541910 | |
mean | 0.833955 | 0.957839 | |
std | 0.219049 | 0.197405 | |
create_archive | amax | NaN | 10.715171 |
amin | NaN | 0.448563 | |
mean | NaN | 0.710282 | |
std | NaN | 0.683334 | |
create_index | amax | 12.130562 | 1.417956 |
amin | 0.738262 | 0.875968 | |
mean | 1.065575 | 0.991173 | |
std | 1.135723 | 0.146910 | |
delete_index | amax | 0.906493 | 0.889648 |
amin | 0.304015 | 0.340288 | |
mean | 0.378573 | 0.389599 | |
std | 0.100293 | 0.074852 | |
fetch_cluster_state | amax | 4.802250 | 5.298962 |
amin | 2.723684 | 2.995567 | |
mean | 3.470338 | 3.768098 | |
std | 0.680308 | 0.541050 | |
move_shard | amax | 1.640414 | 1.418294 |
amin | 0.431839 | 0.539606 | |
mean | 0.918518 | 1.053511 | |
std | 0.211874 | 0.190068 | |
remove_replica | amax | 0.845920 | 0.426370 |
amin | 0.294826 | 0.336764 | |
mean | 0.346358 | 0.362251 | |
std | 0.101348 | 0.015353 |
The rest of the graphs here show details about test runs in terms of a histogram and an over-time graph per metric.
# The loop variable is deliberately not called ``report``: that name would
# rebind the module-level ``report`` function to a string.
for report_name in reports:
    detail_report(report_name)