
Commit b159622

Analyzing results, tester.py fixes.
1 parent 3ed8499 commit b159622

File tree

3 files changed: +163, -30 lines

devops/analyze.sql

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+drop table if exists shmn_benchmarks cascade;
+create table shmn_benchmarks (
+    test_id text, instance_type text, workers int, nparts int, sharded_tables text,
+    replicas int, repmode text, sync_replicas bool, sync_commit text, CFLAGS text,
+    scale int, seconds int, test text, fdw_2pc bool, active_workers text, clients int,
+    tps_sum int, avg_latency numeric, end_latency numeric, wal_lag bigint,
+    comment text);
+copy shmn_benchmarks from '/home/ars/shmn_benchmarks.csv' with (format csv, header);
+
+select workers, nparts, repmode, sync_replicas, clients, tps_sum, pg_size_pretty(wal_lag) from shmn_benchmarks;
+
+-- only the important fields
+create view shmn_bench as select workers, nparts, sharded_tables, repmode, sync_replicas, test, fdw_2pc, active_workers, clients, tps_sum,
+    pg_size_pretty(wal_lag) wal_lag
+from shmn_benchmarks;
+drop view if exists shmn_bench;
+
+-- Take only the runs whose number of clients maximizes tps: for each set of
+-- rows differing only in the number of clients, pick from the window the
+-- row with the max tps.
+drop view if exists shmn_benchmarks_optimal_clients;
+create view shmn_benchmarks_optimal_clients as
+select distinct on (workers, nparts, sharded_tables, repmode, sync_replicas, test, fdw_2pc, active_workers)
+    workers, nparts, sharded_tables, repmode, sync_replicas, test, fdw_2pc, active_workers,
+    last_value(clients) over wnd clients,
+    last_value(tps_sum) over wnd tps_sum,
+    last_value(avg_latency) over wnd avg_latency,
+    last_value(end_latency) over wnd end_latency,
+    pg_size_pretty(last_value(wal_lag) over wnd) wal_lag
+from shmn_benchmarks
+window wnd as
+    (partition by workers, nparts, sharded_tables, repmode, sync_replicas, fdw_2pc, test, active_workers order by tps_sum
+     rows between unbounded preceding and unbounded following);
+
+-- Create first() aggregate, taken from
+-- https://wiki.postgresql.org/wiki/First/last_(aggregate)
+-- Create a function that always returns the first non-NULL item
+CREATE OR REPLACE FUNCTION public.first_agg ( anyelement, anyelement )
+RETURNS anyelement LANGUAGE SQL IMMUTABLE STRICT AS $$
+    SELECT $1;
+$$;
+
+-- And then wrap an aggregate around it
+drop aggregate if exists public.first(anyelement);
+CREATE AGGREGATE public.FIRST (
+    sfunc    = public.first_agg,
+    basetype = anyelement,
+    stype    = anyelement
+);
+
+-- flatten to compare no replication, trigger, sync and async logical; 2pc runs only
+select workers, nparts, (nparts / workers) nparts_per_node,
+    first(tps_sum) filter (where repmode is null) no_rep_tps,
+    first(tps_sum) filter (where repmode = 'trigger') trig_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and sync_replicas) sync_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and not sync_replicas) async_rep_tps,
+    first(wal_lag) filter (where repmode = 'logical' and not sync_replicas) async_rep_wal_lag
+from shmn_benchmarks_optimal_clients
+where active_workers = workers::text and fdw_2pc
+group by workers, nparts;
+
+-- same, also showing the number of clients
+select workers, nparts, (nparts / workers) nparts_per_node,
+    first(tps_sum) filter (where repmode is null) no_rep_tps,
+    first(clients) filter (where repmode is null) no_rep_clients,
+    first(tps_sum) filter (where repmode = 'trigger') trig_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and sync_replicas) sync_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and not sync_replicas) async_rep_tps,
+    first(wal_lag) filter (where repmode = 'logical' and not sync_replicas) async_rep_wal_lag
+from shmn_benchmarks_optimal_clients
+where active_workers = workers::text and fdw_2pc
+group by workers, nparts;
+
+-- both with and without 2pc, showing which is which
+select workers, nparts, (nparts / workers) nparts_per_node,
+    first(tps_sum) filter (where repmode is null) no_rep_tps,
+    first(fdw_2pc) filter (where repmode is null) no_rep_2pc,
+    first(tps_sum) filter (where repmode = 'trigger') trig_rep_tps,
+    first(fdw_2pc) filter (where repmode = 'trigger') trig_rep_2pc,
+    first(tps_sum) filter (where repmode = 'logical' and sync_replicas) sync_rep_tps,
+    first(fdw_2pc) filter (where repmode = 'logical' and sync_replicas) sync_rep_2pc,
+    first(tps_sum) filter (where repmode = 'logical' and not sync_replicas) async_rep_tps,
+    first(fdw_2pc) filter (where repmode = 'logical' and not sync_replicas) async_rep_2pc,
+    first(wal_lag) filter (where repmode = 'logical' and not sync_replicas) async_rep_wal_lag
+from shmn_benchmarks_optimal_clients
+where active_workers = workers::text
+group by workers, nparts;
+
+
+select workers, nparts, sharded_tables, repmode, sync_replicas, clients, tps_sum,
+    wal_lag
+from shmn_benchmarks_optimal_clients where active_workers = workers::text and fdw_2pc;
+
+select workers, nparts, sharded_tables, repmode, sync_replicas, fdw_2pc, test, clients, tps_sum,
+    wal_lag
+from shmn_benchmarks_optimal_clients where active_workers = workers::text and sharded_tables = 'pgbench_accounts';
+
+
+-- note how the lag grows only where there are already too many clients
+select * from shmn_bench where repmode = 'logical' and not sync_replicas and fdw_2pc and (
+    (workers = 3 and (nparts = 9 or nparts = 30)) or
+    (workers = 6 and (nparts = 6 or nparts = 12 or nparts = 18 or nparts = 60)) or
+    (workers = 9 and (nparts = 27 or nparts = 90)) or
+    (workers = 12)
+)
+order by workers, nparts, clients;
+
+-- same, only for 6:6 and 6:12
+select * from shmn_bench where repmode = 'logical' and not sync_replicas and fdw_2pc and (
+    (workers = 6 and (nparts = 6 or nparts = 12))
+)
+order by workers, nparts, clients;
+
+
+-- 2pc vs non-2pc
+select *, (s.no_two_pc_tps::numeric(10, 0) / s.two_pc_tps)::numeric(3, 2) no_two_pc_faster_times from
+    (select workers, nparts, sharded_tables, repmode, sync_replicas, test, active_workers, clients,
+        first(tps_sum) filter (where fdw_2pc) two_pc_tps,
+        first(tps_sum) filter (where not fdw_2pc) no_two_pc_tps
+     from shmn_bench
+     group by workers, nparts, sharded_tables, repmode, sync_replicas, test, active_workers, clients) s
+where (s.two_pc_tps is not null and s.no_two_pc_tps is not null)
+order by workers, nparts, repmode, sync_replicas, active_workers, clients;
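
Aside, not part of the commit: the optimal-clients view pairs DISTINCT ON with last_value() over a full-partition window. Because the window is ordered by tps_sum and framed as "rows between unbounded preceding and unbounded following", last_value() returns the values of the max-tps row on every row of the partition, so DISTINCT ON can keep any one row per group and still get the right numbers. A minimal sketch of the same trick, with made-up table and column names:

    -- Illustration only, not part of the commit.
    create temp table demo (grp int, clients int, tps int);
    insert into demo values (1, 16, 900), (1, 32, 1400), (1, 64, 1100);

    -- Every row of the partition sees the same last_value() results (those
    -- of the max-tps row, since the frame spans the whole partition ordered
    -- by tps), so distinct on (grp) keeps exactly one such row per group.
    select distinct on (grp)
        grp,
        last_value(clients) over wnd clients,
        last_value(tps) over wnd tps
    from demo
    window wnd as (partition by grp order by tps
                   rows between unbounded preceding and unbounded following);
    -- => (1, 32, 1400)

The file itself can presumably be replayed end-to-end with psql -f devops/analyze.sql, once shmn_benchmarks.csv is in place at the hard-coded /home/ars path.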

devops/logs.yml

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,7 @@
     local_action: file
       src={{ inventory_hostname }}.log
       dest={{ logs }}/{{ node_id.stdout }}.log
+      force=yes
       state=link
 
 - name: touch logs/united.log
@@ -42,4 +43,5 @@
     local_action: file
       src={{ groups['shardlord'][0] }}.log
      dest={{ logs }}/shardlord.log
+      force=yes
       state=link
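
The force=yes additions look like an idempotency fix (my inference; the commit message does not say): with state=link, Ansible's file module refuses to replace an existing regular file at the destination unless force is set, so re-running log collection over leftovers from a previous run would otherwise fail.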

devops/tester.py

Lines changed: 38 additions & 30 deletions
@@ -5,35 +5,42 @@
 import csv
 import os
 import shutil
+import subprocess
 from subprocess import check_call, check_output
 
 from inventory_ec2.ec2_elect_shardlord import ec2_elect_shardlord
 
 scale = 10
 duration = 30
 
-workers = [3, 6, 9]
+workers = [6]
 # replications = ['none', 'logical_sync', 'logical_async', 'trigger']
-replications = ['logical_async']
+replications = ['logical_sync']
 async_config = '''
 synchronous_commit = local
 shardman.sync_replicas = off
 '''
-nparts_per_node = [3, 10]
+nparts_per_node = [2]
 clients = [1, 16, 32, 64, 128]
 
-# debug
-# workers = [3]
-# replications = ['logical_async']
-# nparts_per_node = [3]
-# clients = [8]
-
 create_instances = True
 destroy_instances = True
 provision = True
 rebuild_shardman = True
 prepare = True
 
+
+# debug
+# workers = [3]
+# replications = ['logical_async']
+# nparts_per_node = [3]
+# clients = [8]
+# create_instances = False
+# destroy_instances = False
+# provision = False
+# rebuild_shardman = False
+# prepare = False
+
 resfile_path = "tester_res.csv"
 resfile_writer = None
 
@@ -48,10 +55,10 @@ def __init__(self):
         self.wal_lag = ''
 
 def res_header():
-    return ["test_id", "instance_type", "num of workers", "nparts", "replicas",
-            "repmode", "sync_replicas", "sync_commit", "CFLAGS", "scale",
+    return ["test_id", "instance_type", "num of workers", "nparts", "sharded tables",
+            "replicas", "repmode", "sync_replicas", "sync_commit", "CFLAGS", "scale",
             "seconds", "test", "fdw 2pc", "active_workers", "clients", "tps sum",
-            "avg latency", "end latency", "wal lag"]
+            "avg latency", "end latency", "wal lag", "comment"]
 
 def tester():
     if os.path.isfile(resfile_path):
@@ -139,11 +146,19 @@ def run(test_row):
         ssh {} 'nohup /home/ubuntu/monitor_wal_lag.sh > monitor_wal_lag.out 2>&1 &'
         '''.format(mon_node), shell=True)
 
-    run_output = check_output(
-        '''
-        ansible-playbook -i inventory_ec2/ pgbench_run.yml -e \
-        'tmstmp=true tname=t pgbench_opts="-N -c {} -T {}"'
-        '''.format(test_row.clients, duration), shell=True).decode("ascii")
+    try:
+        run_output = check_output(
+            '''
+            ansible-playbook -i inventory_ec2/ pgbench_run.yml -e \
+            'tmstmp=true tname=t pgbench_opts="-N -c {} -T {}"'
+            '''.format(test_row.clients, duration), shell=True, stderr=subprocess.STDOUT).decode("ascii")
+    except subprocess.CalledProcessError as e:
+        print('pgbench_run failed, stdout and stderr was:')
+        print(e.output.decode('ascii'))
+        raise e
+
+    print('Here is run output:')
+    print(run_output)
 
     # stop wal lag monitoring and pull the results
     if test_row.replication == 'logical_async':
@@ -152,11 +167,11 @@ def run(test_row):
         max_lag_bytes = int(check_output(
             "awk -v max=0 '{if ($1 > max) {max=$1} }END {print max}' wal_lag.txt",
             shell=True))
-        test_row.wal_lag = size_pretty(max_lag_bytes)
+        test_row.wal_lag = max_lag_bytes
+    else:
+        test_row.wal_lag = ''
 
 
-    print('Here is run output:')
-    print(run_output)
     test_id_re = re.compile('test_id is ([\w-]+)')
     test_id = test_id_re.search(run_output).group(1)
     print('test id is {}'.format(test_id))
@@ -190,17 +205,10 @@ def form_csv_row(test_row):
         sync_commit = 'local'
 
     return [test_row.test_id, 'c3.2xlarge', test_row.workers, test_row.workers * test_row.nparts_per_node,
-            replicas, repmode, sync_replicas, sync_commit, "-O2", scale, duration,
+            "pgbench_accounts", replicas, repmode, sync_replicas, sync_commit, "-O2", scale, duration,
            "pgbench -N", "on", test_row.workers, test_row.clients, test_row.tps_sum,
-            '', '', test_row.wal_lag]
-
-def size_pretty(size, precision=2):
-    suffixes = ['B','KB','MB','GB','TB']
-    suffixIndex = 0
-    while size > 1024 and suffixIndex < 4:
-        suffixIndex += 1
-        size = size / 1024.0
-    return "%.*f%s" % (precision, size, suffixes[suffixIndex])
+            '', '', test_row.wal_lag, '']
+
 
 if __name__ == '__main__':
     tester()
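
One connecting observation, inferred from the diff rather than stated in it: tester.py now records the maximum WAL lag as raw bytes (and drops its local size_pretty() helper), leaving human-readable formatting to analysis time via PostgreSQL's pg_size_pretty(), as analyze.sql above already does. For instance:

    select pg_size_pretty(123456789::bigint);  -- roughly '118 MB'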
