|
2026-03-23
§
|
| 16:55 |
<btullis@cumin1003> |
START - Cookbook sre.hosts.reboot-single for host dse-k8s-ctrl1001.eqiad.wmnet |
[production] |
| 16:53 |
<cgoubert@deploy2002> |
helmfile [eqiad] START helmfile.d/services/rest-gateway: apply |
[production] |
| 16:52 |
<cgoubert@deploy2002> |
helmfile [staging] DONE helmfile.d/services/rest-gateway: apply |
[production] |
| 16:52 |
<cgoubert@deploy2002> |
helmfile [staging] START helmfile.d/services/rest-gateway: apply |
[production] |
| 16:50 |
<elukey@cumin1003> |
START - Cookbook sre.hosts.provision for host aux-k8s-worker1007.mgmt.eqiad.wmnet with chassis set policy GRACEFUL_RESTART |
[production] |
| 16:46 |
<elukey@cumin1003> |
END (PASS) - Cookbook sre.hosts.provision (exit_code=0) for host aux-k8s-worker1006.mgmt.eqiad.wmnet with chassis set policy GRACEFUL_RESTART |
[production] |
| 16:41 |
<elukey@cumin1003> |
START - Cookbook sre.hosts.provision for host aux-k8s-worker1006.mgmt.eqiad.wmnet with chassis set policy GRACEFUL_RESTART |
[production] |
| 16:38 |
<volans@cumin2002> |
END (PASS) - Cookbook sre.hosts.reboot-single (exit_code=0) for host cloudcumin2001.codfw.wmnet |
[production] |
| 16:35 |
<cgoubert@deploy2002> |
helmfile [codfw] DONE helmfile.d/services/rest-gateway: apply |
[production] |
| 16:34 |
<elukey@cumin1003> |
END (FAIL) - Cookbook sre.hosts.provision (exit_code=99) for host aux-k8s-worker1006.mgmt.eqiad.wmnet with chassis set policy GRACEFUL_RESTART |
[production] |
| 16:34 |
<elukey@cumin1003> |
START - Cookbook sre.hosts.provision for host aux-k8s-worker1006.mgmt.eqiad.wmnet with chassis set policy GRACEFUL_RESTART |
[production] |
| 16:34 |
<volans@cumin2002> |
START - Cookbook sre.hosts.reboot-single for host cloudcumin2001.codfw.wmnet |
[production] |
| 16:32 |
<cgoubert@deploy2002> |
helmfile [codfw] START helmfile.d/services/rest-gateway: apply |
[production] |
| 16:31 |
<cgoubert@deploy2002> |
helmfile [eqiad] DONE helmfile.d/services/rest-gateway: apply |
[production] |
| 16:30 |
<cgoubert@deploy2002> |
helmfile [eqiad] START helmfile.d/services/rest-gateway: apply |
[production] |
| 16:29 |
<fnegri@cumin1003> |
END (PASS) - Cookbook sre.hosts.remove-downtime (exit_code=0) for clouddb1023.eqiad.wmnet |
[production] |
| 16:29 |
<fnegri@cumin1003> |
START - Cookbook sre.hosts.remove-downtime for clouddb1023.eqiad.wmnet |
[production] |
| 16:28 |
<cgoubert@deploy2002> |
helmfile [staging] DONE helmfile.d/services/rest-gateway: apply |
[production] |
| 16:27 |
<cgoubert@deploy2002> |
helmfile [staging] START helmfile.d/services/rest-gateway: apply |
[production] |
| 16:24 |
<eevans@cumin1003> |
END (PASS) - Cookbook sre.hosts.remove-downtime (exit_code=0) for aqs1010.eqiad.wmnet |
[production] |
| 16:24 |
<eevans@cumin1003> |
START - Cookbook sre.hosts.remove-downtime for aqs1010.eqiad.wmnet |
[production] |
| 16:24 |
<dhinus> |
added komla to https://gitlab.wikimedia.org/groups/repos/cloud/-/group_members T420532 |
[admin] |
| 16:21 |
<jgreen@dns1004> |
END - running authdns-update |
[production] |
| 16:19 |
<jgreen@dns1004> |
START - running authdns-update |
[production] |
| 16:18 |
<cgoubert@deploy2002> |
helmfile [staging] DONE helmfile.d/services/rest-gateway: apply |
[production] |
| 16:17 |
<cgoubert@deploy2002> |
helmfile [staging] START helmfile.d/services/rest-gateway: apply |
[production] |
| 16:11 |
<fnegri@cumin1003> |
DONE (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 1:00:00 on clouddb1023.eqiad.wmnet with reason: Rebooting clouddb1023 T419960 |
[production] |
| 16:09 |
<fnegri@cumin1003> |
END (PASS) - Cookbook sre.hosts.remove-downtime (exit_code=0) for clouddb1025.eqiad.wmnet |
[production] |
| 16:09 |
<fnegri@cumin1003> |
START - Cookbook sre.hosts.remove-downtime for clouddb1025.eqiad.wmnet |
[production] |
| 16:09 |
<fnegri@cumin1003> |
conftool action : set/pooled=yes; selector: name=clouddb1025.eqiad.wmnet |
[production] |
| 16:04 |
<urandom> |
stopping aqs1010 for SSD replacement — T420867 |
[production] |
| 16:03 |
<elukey@cumin1003> |
END (PASS) - Cookbook sre.hosts.provision (exit_code=0) for host dse-k8s-worker1023.mgmt.eqiad.wmnet with chassis set policy FORCE_RESTART |
[production] |
| 16:03 |
<eevans@cumin1003> |
DONE (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 8:00:00 on aqs1010.eqiad.wmnet with reason: Shutting down for SSD replacement — T420867 |
[production] |
| 15:59 |
<ejegg> |
civicrm upgraded from a2d4b17c to 675455b2 |
[fundraising] |
| 15:58 |
<fnegri@cumin1003> |
conftool action : set/pooled=no; selector: name=clouddb1025.eqiad.wmnet |
[production] |
| 15:57 |
<fnegri@cumin1003> |
DONE (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 1:00:00 on clouddb1025.eqiad.wmnet with reason: Rebooting clouddb1025 T419960 |
[production] |
| 15:57 |
<elukey@cumin1003> |
START - Cookbook sre.hosts.provision for host dse-k8s-worker1023.mgmt.eqiad.wmnet with chassis set policy FORCE_RESTART |
[production] |
| 15:56 |
<elukey@cumin1003> |
END (PASS) - Cookbook sre.hosts.provision (exit_code=0) for host dse-k8s-worker1022.mgmt.eqiad.wmnet with chassis set policy FORCE_RESTART |
[production] |
| 15:53 |
<topranks> |
disabling puppet for nftables-enabled machines to validate new ruleset on selected hosts before wider rollout T420715 |
[production] |
| 15:50 |
<elukey@cumin1003> |
START - Cookbook sre.hosts.provision for host dse-k8s-worker1022.mgmt.eqiad.wmnet with chassis set policy FORCE_RESTART |
[production] |
| 15:50 |
<elukey@cumin1003> |
END (PASS) - Cookbook sre.hosts.provision (exit_code=0) for host dse-k8s-worker1021.mgmt.eqiad.wmnet with chassis set policy FORCE_RESTART |
[production] |
| 15:49 |
<kevinbazira@deploy2002> |
helmfile [ml-serve-eqiad] Ran 'sync' command on namespace 'experimental' for release 'main' . |
[production] |
| 15:44 |
<elukey@cumin1003> |
START - Cookbook sre.hosts.provision for host dse-k8s-worker1021.mgmt.eqiad.wmnet with chassis set policy FORCE_RESTART |
[production] |
| 15:31 |
<btullis@cumin1003> |
START - Cookbook sre.hosts.reboot-single for host an-worker1172.eqiad.wmnet |
[production] |
| 15:28 |
<Lucas_WMDE> |
ssh integration-castor05.integration.eqiad1.wikimedia.cloud sudo -u jenkins-deploy rm -rf /srv/castor/castor-mw-ext-and-skins/master/mediawiki-node20 # fix failure seen in mediawiki-node20 90272 |
[releng] |
| 15:21 |
<cgoubert@deploy2002> |
helmfile [staging] DONE helmfile.d/services/rest-gateway: apply |
[production] |
| 15:20 |
<cgoubert@deploy2002> |
helmfile [staging] START helmfile.d/services/rest-gateway: apply |
[production] |
| 15:15 |
<fnegri@cumin1003> |
END (PASS) - Cookbook sre.hosts.remove-downtime (exit_code=0) for clouddb1020.eqiad.wmnet |
[production] |
| 15:14 |
<fnegri@cumin1003> |
START - Cookbook sre.hosts.remove-downtime for clouddb1020.eqiad.wmnet |
[production] |
| 15:14 |
<fnegri@cumin1003> |
conftool action : set/pooled=yes; selector: name=clouddb1020.eqiad.wmnet |
[production] |