2021-07-20
§
|
20:30 |
<joal> |
rerun webrequest timed-out instances |
[analytics] |
18:58 |
<mforns> |
starting refinery deployment |
[analytics] |
18:40 |
<razzi> |
razzi@an-launcher1002:~$ sudo puppet agent --enable |
[analytics] |
18:39 |
<razzi> |
razzi@an-master1001:/var/log/hadoop-hdfs$ sudo -u yarn kerberos-run-command yarn yarn rmadmin -refreshQueues |
[analytics] |
18:37 |
<razzi> |
razzi@an-master1002:~$ sudo -i puppet agent --enable |
[analytics] |
18:34 |
<razzi> |
razzi@an-master1002:~$ sudo -u yarn kerberos-run-command yarn yarn rmadmin -refreshQueues |
[analytics] |
18:32 |
<razzi> |
razzi@an-master1002:~$ sudo systemctl start hadoop-yarn-resourcemanager.service |
[analytics] |
18:31 |
<razzi> |
razzi@an-master1002:~$ sudo systemctl stop hadoop-yarn-resourcemanager.service |
[analytics] |
18:22 |
<razzi> |
sudo -u hdfs /usr/bin/hdfs haadmin -failover an-master1002-eqiad-wmnet an-master1001-eqiad-wmnet |
[analytics] |
18:21 |
<razzi> |
re-enable yarn queues by merging puppet patch https://gerrit.wikimedia.org/r/c/operations/puppet/+/705732 |
[analytics] |
17:27 |
<razzi> |
razzi@cumin1001:~$ sudo -i wmf-auto-reimage-host -p T278423 an-master1001.eqiad.wmnet |
[analytics] |
17:17 |
<razzi> |
stop all hadoop processes on an-master1001 |
[analytics] |
16:52 |
<razzi> |
starting hadoop processes on an-master1001 since they didn't fail over cleanly |
[analytics] |
16:31 |
<razzi> |
sudo bash gid_script.bash on an-master1001 |
[analytics] |
16:29 |
<razzi> |
razzi@alert1001:~$ sudo icinga-downtime -h an-master1001 -d 7200 -r "an-master1001 debian upgrade" |
[analytics] |
16:25 |
<razzi> |
razzi@an-master1001:~$ sudo systemctl stop hadoop-mapreduce-historyserver |
[analytics] |
16:25 |
<razzi> |
sudo systemctl stop hadoop-hdfs-zkfc.service on an-master1001 again |
[analytics] |
16:25 |
<razzi> |
sudo systemctl stop hadoop-yarn-resourcemanager on an-master1001 again |
[analytics] |
16:23 |
<razzi> |
sudo systemctl stop hadoop-hdfs-namenode on an-master1001 |
[analytics] |
16:19 |
<razzi> |
razzi@an-master1001:~$ sudo systemctl stop hadoop-hdfs-zkfc |
[analytics] |
16:19 |
<razzi> |
razzi@an-master1001:~$ sudo systemctl stop hadoop-yarn-resourcemanager |
[analytics] |
16:18 |
<razzi> |
sudo systemctl stop hadoop-hdfs-namenode |
[analytics] |
16:10 |
<razzi> |
razzi@cumin1001:~$ sudo transfer.py an-master1002.eqiad.wmnet:/home/razzi/hdfs-namenode-snapshot-buster-reimage-$(date --iso-8601).tar.gz stat1004.eqiad.wmnet:/home/razzi/hdfs-namenode-fsimage |
[analytics] |
16:03 |
<razzi> |
root@an-master1002:/srv/hadoop/name# tar -czf /home/razzi/hdfs-namenode-snapshot-buster-reimage-$(date --iso-8601).tar.gz current |
[analytics] |
15:57 |
<razzi> |
sudo -u hdfs kerberos-run-command hdfs hdfs dfsadmin -saveNamespace |
[analytics] |
15:52 |
<razzi> |
sudo -u hdfs kerberos-run-command hdfs hdfs dfsadmin -safemode enter |
[analytics] |
15:37 |
<razzi> |
kill yarn applications: for jobId in $(yarn application -list | awk 'NR > 2 { print $1 }'); do yarn application -kill $jobId; done |
[analytics] |
15:08 |
<razzi> |
sudo -u yarn kerberos-run-command yarn yarn rmadmin -refreshQueues |
[analytics] |
14:52 |
<razzi> |
sudo systemctl stop 'gobblin-*.timer' |
[analytics] |
14:51 |
<razzi> |
sudo systemctl stop analytics-reportupdater-logs-rsync.timer |
[analytics] |
14:47 |
<razzi> |
Disable jobs on an-launcher1002 (see https://phabricator.wikimedia.org/T278423#7190372) |
[analytics] |
14:46 |
<razzi> |
razzi@an-launcher1002:~$ sudo puppet agent --disable 'razzi: upgrade hadoop masters to debian buster' |
[analytics] |
08:32 |
<mforns> |
restarted webrequest bundle (messed up a coord when trying to rerun some failed hours) |
[analytics] |