2021-07-20
§
|
17:57 |
<razzi@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 2:00:00 on an-master1001.eqiad.wmnet with reason: REIMAGE |
[production] |
17:55 |
<razzi@cumin1001> |
START - Cookbook sre.hosts.downtime for 2:00:00 on an-master1001.eqiad.wmnet with reason: REIMAGE |
[production] |
17:45 |
<arturo> |
pushed new toolforge-jobs-framework-api docker image into the registry (3a6ae38d51202c5c765c8d800cb8380e2a20b998) (T286126) |
[tools] |
17:37 |
<arturo> |
added toolforge-jobs-framework-cli v3 to aptly buster-tools and buster-toolsbeta |
[tools] |
17:27 |
<razzi> |
razzi@cumin1001:~$ sudo -i wmf-auto-reimage-host -p T278423 an-master1001.eqiad.wmnet |
[analytics] |
17:17 |
<razzi> |
stop all hadoop processes on an-master1001 |
[analytics] |
17:07 |
<andrewbogott> |
reloading haproxy on dbproxy1018 for T286598 |
[admin] |
17:06 |
<rzl> |
enabled puppet on A:mw |
[production] |
16:54 |
<btullis@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 1:00:00 on 20 hosts with reason: dealing with an-master1001 rebuild issue |
[production] |
16:54 |
<btullis@cumin1001> |
START - Cookbook sre.hosts.downtime for 1:00:00 on 20 hosts with reason: dealing with an-master1001 rebuild issue |
[production] |
16:53 |
<rzl> |
disabled puppet on A:mw to test https://gerrit.wikimedia.org/r/676508 |
[production] |
16:53 |
<btullis@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 1:00:00 on 64 hosts with reason: dealing with an-master1001 rebuild issue |
[production] |
16:53 |
<btullis@cumin1001> |
START - Cookbook sre.hosts.downtime for 1:00:00 on 64 hosts with reason: dealing with an-master1001 rebuild issue |
[production] |
16:52 |
<razzi> |
starting hadoop processes on an-master1001 since they didn't failover cleanly |
[analytics] |
16:44 |
<dcausse@deploy1002> |
helmfile [staging] Ran 'sync' command on namespace 'rdf-streaming-updater' for release 'main' . |
[production] |
16:37 |
<dzahn@cumin1001> |
END (PASS) - Cookbook sre.hosts.decommission (exit_code=0) for hosts mw1297.eqiad.wmnet |
[production] |
16:31 |
<razzi> |
sudo bash gid_script.bash on an-master1001 |
[analytics] |
16:29 |
<razzi> |
razzi@alert1001:~$ sudo icinga-downtime -h an-master1001 -d 7200 -r "an-master1001 debian upgrade" |
[analytics] |
16:25 |
<razzi> |
razzi@an-master1001:~$ sudo systemctl stop hadoop-mapreduce-historyserver |
[analytics] |
16:25 |
<razzi> |
sudo systemctl stop hadoop-hdfs-zkfc.service on an-master1001 again |
[analytics] |
16:25 |
<dcausse@deploy1002> |
helmfile [staging] Ran 'sync' command on namespace 'rdf-streaming-updater' for release 'main' . |
[production] |
16:25 |
<razzi> |
sudo systemctl stop hadoop-yarn-resourcemanager on an-master1001 again |
[analytics] |
16:24 |
<dzahn@cumin1001> |
START - Cookbook sre.hosts.decommission for hosts mw1297.eqiad.wmnet |
[production] |
16:23 |
<razzi> |
sudo systemctl stop hadoop-hdfs-namenode on an-master1001 |
[analytics] |
16:21 |
<dzahn@cumin1001> |
END (PASS) - Cookbook sre.hosts.decommission (exit_code=0) for hosts mw1290.eqiad.wmnet |
[production] |
16:19 |
<razzi> |
razzi@an-master1001:~$ sudo systemctl stop hadoop-hdfs-zkfc |
[analytics] |
16:19 |
<razzi> |
razzi@an-master1001:~$ sudo systemctl stop hadoop-yarn-resourcemanager |
[analytics] |
16:18 |
<razzi> |
sudo systemctl stop hadoop-hdfs-namenode |
[analytics] |
16:11 |
<dzahn@cumin1001> |
START - Cookbook sre.hosts.decommission for hosts mw1290.eqiad.wmnet |
[production] |
16:10 |
<razzi> |
razzi@cumin1001:~$ sudo transfer.py an-master1002.eqiad.wmnet:/home/razzi/hdfs-namenode-snapshot-buster-reimage-$(date --iso-8601).tar.gz stat1004.eqiad.wmnet:/home/razzi/hdfs-namenode-fsimage |
[analytics] |
16:10 |
<dzahn@cumin1001> |
END (PASS) - Cookbook sre.hosts.decommission (exit_code=0) for hosts mw1289.eqiad.wmnet |
[production] |
16:03 |
<razzi> |
root@an-master1002:/srv/hadoop/name# tar -czf /home/razzi/hdfs-namenode-snapshot-buster-reimage-$(date --iso-8601).tar.gz current |
[analytics] |
15:59 |
<dzahn@cumin1001> |
START - Cookbook sre.hosts.decommission for hosts mw1289.eqiad.wmnet |
[production] |
15:57 |
<dzahn@cumin1001> |
conftool action : set/pooled=inactive; selector: name=mw129[07].eqiad.wmnet |
[production] |
15:57 |
<dzahn@cumin1001> |
conftool action : set/pooled=inactive; selector: name=mw1289.eqiad.wmnet |
[production] |
15:57 |
<razzi> |
sudo -u hdfs kerberos-run-command hdfs hdfs dfsadmin -saveNamespace |
[analytics] |
15:52 |
<razzi> |
sudo -u hdfs kerberos-run-command hdfs hdfs dfsadmin -safemode enter |
[analytics] |
15:48 |
<oblivian@deploy1002> |
helmfile [eqiad] Ran 'sync' command on namespace 'mwdebug' for release 'pinkunicorn' . |
[production] |
15:45 |
<arturo> |
failback from labstore1006 to labstore1007 (dumps NFS) https://gerrit.wikimedia.org/r/c/operations/puppet/+/705417 |
[admin] |
15:37 |
<razzi> |
kill yarn applications: for jobId in $(yarn application -list | awk 'NR > 2 { print $1 }'); do yarn application -kill $jobId; done |
[analytics] |
15:23 |
<vgutierrez> |
pool dns1002 - T286069 |
[production] |
15:21 |
<vgutierrez> |
pool cp[1087-1090].eqiad.wmnet - T286069 |
[production] |
15:19 |
<jmm@puppetmaster1001> |
conftool action : set/pooled=yes; selector: name=ldap-replica1004.wikimedia.org |
[production] |
15:17 |
<wm-bot> |
<bd808> Restarting because the bot is not working on all channels. Logs are inconclusive as to why. |
[tools.bridgebot] |
15:14 |
<dzahn@cumin1001> |
conftool action : set/pooled=no; selector: name=mw1297.eqiad.wmnet |
[production] |
15:14 |
<dzahn@cumin1001> |
conftool action : set/pooled=no; selector: name=mw1290.eqiad.wmnet |
[production] |
15:14 |
<dzahn@cumin1001> |
conftool action : set/pooled=no; selector: name=mw1289.eqiad.wmnet |
[production] |
15:08 |
<razzi> |
sudo -u yarn kerberos-run-command yarn yarn rmadmin -refreshQueues |
[analytics] |
15:06 |
<kormat@cumin1001> |
END (PASS) - Cookbook sre.hosts.downtime (exit_code=0) for 4:00:00 on 12 hosts with reason: Deploying schema change to s3 T281058 |
[production] |
15:06 |
<kormat@cumin1001> |
START - Cookbook sre.hosts.downtime for 4:00:00 on 12 hosts with reason: Deploying schema change to s3 T281058 |
[production] |