2021-03-22
§
|
10:47 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] START helmfile.d/admin 'sync'. |
[production] |
10:34 |
<elukey@deploy1002> |
helmfile [ml-serve-eqiad] DONE helmfile.d/admin 'sync'. |
[production] |
10:33 |
<elukey@deploy1002> |
helmfile [ml-serve-eqiad] START helmfile.d/admin 'sync'. |
[production] |
10:32 |
<elukey@deploy1002> |
helmfile [ml-serve-eqiad] DONE helmfile.d/admin 'sync'. |
[production] |
10:32 |
<elukey@deploy1002> |
helmfile [ml-serve-eqiad] START helmfile.d/admin 'sync'. |
[production] |
10:27 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] DONE helmfile.d/admin 'sync'. |
[production] |
10:26 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] START helmfile.d/admin 'sync'. |
[production] |
10:26 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] DONE helmfile.d/admin 'sync'. |
[production] |
10:25 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] START helmfile.d/admin 'sync'. |
[production] |
10:17 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] DONE helmfile.d/admin 'sync'. |
[production] |
10:17 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] START helmfile.d/admin 'sync'. |
[production] |
10:15 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] DONE helmfile.d/admin 'sync'. |
[production] |
10:15 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] START helmfile.d/admin 'sync'. |
[production] |
10:12 |
<elukey> |
run homer for cr1/cr2 eqiad and codfw to add a new iBGP session for the k8s ML clusters - https://gerrit.wikimedia.org/r/c/operations/homer/public/+/661055 |
[production] |
07:51 |
<elukey> |
stop/start mariadb instances on dbstore1004 to reduce buffer pool memory settings - T273865 |
[production] |
2021-03-09
§
|
20:42 |
<elukey> |
reimaged an-worker1091 to buster |
[production] |
20:25 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1091.eqiad.wmnet with reason: REIMAGE |
[production] |
20:23 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1091.eqiad.wmnet with reason: REIMAGE |
[production] |
18:52 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1087.eqiad.wmnet with reason: REIMAGE |
[production] |
18:49 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1087.eqiad.wmnet with reason: REIMAGE |
[production] |
18:26 |
<elukey> |
reimage an-worker1087 to buster |
[production] |
18:02 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1085.eqiad.wmnet with reason: REIMAGE |
[production] |
18:00 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1085.eqiad.wmnet with reason: REIMAGE |
[production] |
17:01 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on analytics1077.eqiad.wmnet with reason: REIMAGE |
[production] |
16:59 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on analytics1077.eqiad.wmnet with reason: REIMAGE |
[production] |
16:40 |
<elukey> |
reimage analytics1077 to buster |
[production] |
15:45 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on analytics1072.eqiad.wmnet with reason: REIMAGE |
[production] |
15:43 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on analytics1072.eqiad.wmnet with reason: REIMAGE |
[production] |
15:18 |
<elukey> |
reimage analytics1072 (hadoop hdfs journal node) to buster |
[production] |
14:56 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1089.eqiad.wmnet with reason: REIMAGE |
[production] |
14:54 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1090.eqiad.wmnet with reason: REIMAGE |
[production] |
14:53 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1089.eqiad.wmnet with reason: REIMAGE |
[production] |
14:52 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1090.eqiad.wmnet with reason: REIMAGE |
[production] |
14:29 |
<elukey> |
drain + reimage an-worker1090 and an-worker1089 to Buster |
[production] |
13:52 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1102.eqiad.wmnet with reason: REIMAGE |
[production] |
13:50 |
<elukey@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-worker1080.eqiad.wmnet with reason: REIMAGE |
[production] |
13:49 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1102.eqiad.wmnet with reason: REIMAGE |
[production] |
13:49 |
<elukey@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-worker1080.eqiad.wmnet with reason: REIMAGE |
[production] |
13:27 |
<elukey> |
reimage an-worker1102 and an-worker1080 (hdfs journal node) to Buster |
[production] |