2021-11-25
§
|
08:40 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'linkrecommendation' for release 'internal' . |
[production] |
08:40 |
<ladsgroup@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 4:00:00 on db1146.eqiad.wmnet with reason: Maintenance T296143 |
[production] |
08:39 |
<ladsgroup@cumin1001> |
START - Cookbook sre.hosts.downtime for 4:00:00 on db1146.eqiad.wmnet with reason: Maintenance T296143 |
[production] |
08:37 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventstreams-internal' for release 'main' . |
[production] |
08:34 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventstreams' for release 'production' . |
[production] |
08:34 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventstreams' for release 'canary' . |
[production] |
08:31 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-logging-external' for release 'canary' . |
[production] |
08:31 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-logging-external' for release 'production' . |
[production] |
08:28 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-analytics-external' for release 'production' . |
[production] |
08:28 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-analytics-external' for release 'canary' . |
[production] |
08:25 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-analytics' for release 'canary' . |
[production] |
08:25 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-analytics' for release 'production' . |
[production] |
08:22 |
<elukey@deploy1002> |
helmfile [ml-serve-eqiad] DONE helmfile.d/admin 'sync'. |
[production] |
08:22 |
<elukey@deploy1002> |
helmfile [ml-serve-eqiad] START helmfile.d/admin 'sync'. |
[production] |
08:21 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] DONE helmfile.d/admin 'sync'. |
[production] |
08:21 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'cxserver' for release 'production' . |
[production] |
08:21 |
<elukey@deploy1002> |
helmfile [ml-serve-codfw] START helmfile.d/admin 'sync'. |
[production] |
08:18 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'citoid' for release 'production' . |
[production] |
08:17 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'citoid' for release 'production' . |
[production] |
08:14 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'changeprop-jobqueue' for release 'production' . |
[production] |
08:13 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'changeprop' for release 'production' . |
[production] |
08:09 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'changeprop' for release 'production' . |
[production] |
08:08 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'changeprop' for release 'production' . |
[production] |
08:05 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'blubberoid' for release 'production' . |
[production] |
08:03 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'apple-search' for release 'main' . |
[production] |
08:02 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'apple-search' for release 'main' . |
[production] |
08:00 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'api-gateway' for release 'production' . |
[production] |
07:57 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'apertium' for release 'production' . |
[production] |
07:56 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'apertium' for release 'production' . |
[production] |
07:53 |
<marostegui@cumin1001> |
END (PASS) - Cookbook sre.hosts.reimage (exit_code=0) for host db1128.eqiad.wmnet with OS bullseye |
[production] |
07:51 |
<jelto@cumin1001> |
conftool action : set/pooled=true; selector: name=eqiad,dnsdisc=(echostore|sessionstore) |
[production] |
07:49 |
<marostegui> |
Stop mysql on db1133 to clone db1128 as a test host T295965 |
[production] |
07:49 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'sessionstore' for release 'production' . |
[production] |
07:48 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'sessionstore' for release 'production' . |
[production] |
07:47 |
<jayme> |
elevated MediaWiki exceptions and fatals (from ~07:35) due to a mistake during re-deploy of eventgate-main |
[production] |
07:45 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'echostore' for release 'production' . |
[production] |
07:35 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-main' for release 'production' . |
[production] |
07:32 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-main' for release 'canary' . |
[production] |
07:32 |
<jelto@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'eventgate-main' for release 'production' . |
[production] |
07:29 |
<elukey_> |
elukey@mwdebug2002:~$ sudo systemctl reset-failed ifup@ens5.service |
[production] |
07:27 |
<marostegui@cumin1001> |
START - Cookbook sre.hosts.reimage for host db1128.eqiad.wmnet with OS bullseye |
[production] |
07:23 |
<ladsgroup@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 4:00:00 on db1145.eqiad.wmnet with reason: Maintenance T296143 |
[production] |
07:23 |
<ladsgroup@cumin1001> |
START - Cookbook sre.hosts.downtime for 4:00:00 on db1145.eqiad.wmnet with reason: Maintenance T296143 |
[production] |
07:20 |
<jelto@cumin1001> |
conftool action : set/pooled=false; selector: name=eqiad,dnsdisc=(apertium|api-gateway|apple-search|blubberoid|citoid|cxserver|echostore|eventgate-analytics|eventgate-analytics-external|eventgate-logging-external|eventstreams|eventstreams-internal|linkrecommendation|mathoid|mobileapps|proton|push-notifications|recommendation-api|sessionstore|shellbox|shellbox-constraints|shellbox-media|shellbox-syntax |
[production] |
07:17 |
<jelto@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 3:00:00 on 32 hosts with reason: helm3 de-deploy T251305 |
[production] |
07:17 |
<jelto@cumin1001> |
START - Cookbook sre.hosts.downtime for 3:00:00 on 32 hosts with reason: helm3 de-deploy T251305 |
[production] |
07:10 |
<jelto> |
downtime PyBal backends health check on lvs1015 and lvs1016 for helm3 de-deploy T251305. I'm keeping an eye on Icinga and will remove the downtime as soon as I'm finished |
[production] |
07:09 |
<jelto> |
start re-deploy procedure in eqiad Kubernetes T251305 |
[production] |
06:31 |
<marostegui> |
Restart tendril's DB |
[production] |
05:51 |
<ryankemper> |
[WDQS Deploy] Deploy complete. Successful test query placed on query.wikidata.org, there's no relevant criticals in Icinga, and Grafana looks good |
[production] |