RH342 - Default - 20190726 Book
RH342 - Default - 20190726 Book
RH342 - Default - 20190726 Book
1. Troubleshooting principles
2. Generic issues
# Persisting journal:
mkdir /var/log/journal
chown root:systemd-journal /var/log/journal
chmod 2755 /var/log/journal
killall -USR1 systemd-journald # or reboot
# journalctl examples:
journalctl -ef # end & follow
journalctl _SYSTEMD_UNIT=sshd.service # generated by sshd service
journalctl -u sshd.service # generated and about sshd service
journalctl -p emerg..err # priority between emergency and error
journalctl -b -1 # only from the previous boot
journalctl --since "2019-01-01 20:30:00" --until "2019-02-02 12:00:00"
journalctl -o verbose # show all fields
# Using RH resources:
yum -y install sos
sosreport --help | less
sosreport -l | less # view currently enabled/disabled plugins and plugin options
sosreport -o <PLUGIN(S)> # enable these plugins only (it will only run these plugins)
sosreport -n <PLUGIN(S)> # skip these plugins (it will run all of the plugins, except for these)
sosreport -e <PLUGIN(S)> # enable previously disabled plugins
sosreport -k xfs.logprint # xfs module and logprint option enabled
# Insights:
yum -y install redhat-access-insights
redhat-access-insights --register
# Cockpit:
yum -y install cockpit
systemctl start cockpit
firewall-cmd --add-service=cockpit --permanent
firewall-cmd --reload # http://localhost:9090
# Co-pilot:
yum -y install pcp # performance co-pilot
systemctl start pmcd # performance metrics collector daemon
systemctl enable pmcd
pmstat -s 5 # 5 samples
pmatop # machine stats and data
pminfo # obtain list of metrics
pminfo -dt proc.nprocs # understand specific metric
pmval -s 5 proc.nprocs # gather sample data about the metric 5x times
pmval -T 1minute kernel.percpu.cpu.idle # per-CPU idle time for one minute
# Historical data:
systemctl start pmlogger # ability to store metrics data to logs (-a <ARCHIVE>)
systemctl enable pmlogger
pcp | grep 'primary logger' # location of the archive log that pmlogger is writing to
ls /var/log/pcp/pmlogger/<HOSTNAME> # collects data every second to this location
pmval -a <ARCHIVE.xz> -f 3 <METRIC> # performance metrics value dump from archive with 3 digits precision
pmval -a /var/log/pcp/pmlogger/serverX.example.com/20190101.00.10.0 kernel.all.load
pmval -a /var/log/pcp/pmlogger/serverX.example.com/20190101.00.10.0 kernel.all.load \
-S '@ Tue Feb 01 12:00:00 2019' -T '@ Tue Feb 01 13:00:00 2019'
$template DynamicFile,"/var/log/loghost/%HOSTNAME%/cron.log"
cron.* ?DynamicFile # 'DynamicFile' here is an arbitrary template name
$template DynamicFile,"/var/log/loghost/%HOSTNAME%/%syslogfacility-text%.log"
*.* -?DynamicFile # the minus turns off syncing of the log file after each write
systemctl restart rsyslog
firewall-cmd --add-port=514/udp --permanent
firewall-cmd --add-port=514/tcp --permanent
firewall-cmd --reload
# Monitor changes:
yum -y install aide # intrusion detection
vim /etc/aide.conf
# Configuration lines:
PERMS = p+i+u+g+acl+selinux # file (p)ermissions, (i)node, (u)ser/(g)roup ownership, acl, selinux
# Selection lines:
/dir1 PERMS # group check on dir1 and all files and dirs below it
=/dir2 PERMS # group check in dir2, but not recursively
!/dir3 # excludes dir3 and all files below it from any checks
# Macro lines:
@@define VAR value # @@{VAR} is reference to the macro defined previously
aide --init # creates /var/lib/aide/aide.db.new.gz every time
mv -v /var/lib/aide/aide.db.new.gz /var/lib/aide/aide.db.gz
aide --check
3. Boot issues
# Configuring Grub2:
vim /etc/default/grub
GRUB_TIMEOUT = seconds the menu is displayed
GRUB_DEFAULT = starts counting from 0, what is the default entry
GRUB_CMDLINE_LINUX = list of extra kernel params, e.g. "rhgb quiet"
grub2-mkconfig -o /boot/grub2/grub.cfg # this is *.cfg and NOT *.conf
# Reinstalling Grub2 into the MBR, must reboot into rescue environment (e.g. anaconda):
chroot /mnt/sysimage
ls -l /boot
grub2-install /dev/vda # rewrite the boot loader sections of the MBR
stopping listed unit will also stop this unit as well
failed requirements will not cause the unit to fail when explicitly started
the unit will fail if required unit is not already running
listed units have to have finished starting before this unit can be started
listed units will be delayed
when wanted unit fails to start, this unit itself will still start
starting this unit will stop the conflicting units
systemctl daemon-reload # this is required after each change
systemctl list-dependencies <UNITNAME>
systemctl list-unit-files
systemctl status # shows tree of services and corresponding PIDs
4. Hardware issues
# Basic commands:
lscpu # identifying processor
cat /proc/cpuinfo # identifying what flags CPU supports
dmidecode -t memory # identifying memory
lsscsi -v # identifying disks
hdparm -I /dev/sda # more information about individual disks
lspci # identifying PCI hardware
lsusb # identifying USB hardware
# Memory failures:
# Older:
yum -y install mcelog # framework for catching and logging exceptions
# Newer:
yum -y install rasdaemon # modern replacement for mcelog
systemctl enable rasdaemon
systemctl start rasdaemon
ras-mc-ctl --status # what does subsystem know about memory
ras-mc-ctl --errors
# Test memory:
yum -y install memtest86+
memtest-setup # this adds new template to Grub2 (/etc/grub.d/)
grub2-mkconfig -o /boot/grub2/grub.cfg # update Grub2 config
# Overcommitting resources:
virsh nodecpustats
virsh nodememstats
virsh dommemstats <DOMAIN>
5. Storage issues
# XFS:
yum -y install xfsprogs
umount /dev/<DEVICE> # re-mount on systems where journal corruption suspected
xfs_repair -n /dev/<DEVICE> # perform only check
xfs_repair [-o force_geometry] /dev/<DEVICE> # perform all corrective actions, shows invalid inodes
mount /dev/<DEVICE> /mountpoint
ls /mountpoint/lost+found # unreferenced files
find /mountpoint -inum <NUMBER> # locate directory with the inode number
diff -s /file/from/backup /mountpoint/lost+found/<NUMBER>
# If corrupt journal log:
xfs_repair -L /dev/<DEVICE> # zeros out the journal log, potentially dangerous
# Recovering LVM:
# Config file:
vim /etc/lvm/lvm.conf
dir # scan for physical volumes (/dev)
obtain_device_list_from_udev # should udev be used (1)
preferred_names # which path name to display for block device
filter # which devices to scan for presence of PV signature
backup # save text-based metadata before each disk change (1)
backup_dir # where the backup of VG metadata should be stored
archive # should old configurations be also archived (1)
archive_dir # where the archives will be stored
retain_min # minimum number of archives to store
retain_days # minimum number of days for archive to be kept
# iSCSI initiator/client:
yum -y install iscsi-initiator-utils # ’systemctl enable iscsi --now’
iscsiadm -m node # see already discovered targets/node records
iscsiadm -m session [-P 3] # validate sessions or connections, P=print level
vim /etc/iscsi/iscsid.conf # restart iscsi/iscsid every time you change this file
vim /etc/iscsi/initiatorname.iscsi # this needs iscsid restart
systemctl restart iscsid
iscsiadm -m discovery -t st -p <TARGET>:<PORT> # discovery & sendtargets for portal -> /var/lib/iscsi/nodes
iscsiadm -m node -T iqn.2016-01.com.example.lab:iscsistorage --login [-d8] # -d8=debug
# Disable CHAP authentication:
iscsiadm -m node -T iqn.2016-01.com.example.lab:iscsistorage -o update -n node.session.auth.authmethod \
-v None [-p <TARGET>:<PORT>] # o=overwrite previous config,n=name,v=value
# Purge all node information from cache, recommended when server’s setting change:
iscsiadm -m node -T iqn.2016-01.com.example.lab:iscsistorage -o delete [-p <TARGET>:<PORT>]
# Purge all known nodes from cache:
iscsiadm -m node -o delete [-p <TARGET>:<PORT>] # default port 3260/tcp
lsblk --scsi
6. RPM issues
7. Network issues
ping -c 1 -W 3 <IPv4> # send single echo request and wait 3s for reply
ping6 [-I <INTERFACE>] <IPv6> # -I is not needed when routable IPv6 is used
# Troubleshooting:
ip addr show dev <DEVICE_NAME>
ip route
nmcli con # display connection information
nmcli dev # display device information
nmcli conn show '<CONNECTION_NAME>' | grep ipv # all config settings
ipv4.method # auto=dhcp, manual=static (needs addresses,gateway)
nmcli conn mod '<CONNECTION_NAME>' ipv4.dns '<IPv4>' # good alternative: 'nmtui', restart affected services
nmcli conn reload # after you manually edit network-scripts
nmcli conn down ’<CONNECTION_NAME>’ ; nmcli \ # changes are not applied to already active interface...
conn up ’<CONNECTION_NAME>’ # ...also updates /etc/resolv.conf
firewall-cmd --list-all-zones [--permanent] # comparing active and permanent can identify problems
firewall-cmd --runtime-to-permanent # quick convert of runtime rules to permanent
host -v -t aaaa <HOSTNAME> <DNS> # query DNS for hostname’s IPv6
8. Application issues
9. Security issues
# SELinux logging (or /var/log/audit/audit.log):
ausearch -m avc -ts recent # display Access Vector Control messages, last 10mins
# Troubleshooting SELinux:
yum -y install setroubleshoot-server # provides ’sealert’, ’sedispatch’ plugin for auditd
service auditd restart # auditd’s sedispatch plugin requires restart, don’t use systemd
sealert -l <UUID_OF_DENIAL>
sealert -a /var/log/audit/audit.log # parse all denial messages out of file
# Common issues:
semanage fcontext -a -t <TYPE> '<PATH>(/.*)?' # add path to the list of standard file contexts
restorecon -Rv <PATH> # apply the new file contexts
touch /.autorelabel # perform automatic relabel of all files after disabled->enforcing
semanage boolean --list # current/default state + description of all SELinux toggles
setsebool -P <BOOLEAN=ON/OFF> # boolean values updated permanently, without -P only in memory
semanage port -a -t <TYPE> -p tcp <PORT> # label an unlabeled port (e.g. http_port_t on port 8001 etc.)
# vsftpd public directory, where anonymous are allowed to write:
semanage fcontext -a -t public_content_rw_t '/var/ftp/pub(/.*)?'
setsebool -P allow_ftpd_anon_write=1
# Troubleshooting PAM:
tail -f /var/log/secure
journalctl -u <PROBLEMATIC_SERVICE> # good start for logging issues: ’journalctl _COMM=login’
rpm -V <PROBLEMATIC_SERVICE> # did the files belonging to service change, especially PAM config
mv /etc/pam.d/<PROBLEMATIC_FILE>{,.broken} # rename broken PAM config, otherwise reinstall will not touch it
diff -u /etc/pam.d/<PROBLEMATIC_FILE>{,.broken} # compare good and bad PAM config of problematic service
authconfig --updateall # recreate all configuration files and re-apply the configuration
yum -y install pam_krb5 # when the module for Kerberos is missing
# Solving LDAP issues:
yum -y install openldap-clients # set of tools
cat /etc/openldap/ldap.conf # LDAP defaults (BASE,URI etc.), usually port TCP/389 with STARTTLS
mv <CRT> /etc/openldap/cacerts;cacertdir_rehash # when CAs mismatches are happening, this needs to be done
ldapsearch -x -ZZ -LL '(uid=ldapuser)' \
cn homeDirectory # -x simple auth, -ZZ enforce TLS, -LL disable comments in output
getent passwd <LDAP_USER> # uses nsswitch.conf to query backend password systems
# Solving Kerberos issues:
kinit <USER> # obtain TGT (ticket granting ticket), time must match everywhere
cat /etc/krb5.conf | grep -A 1 domain_realm # is the [domain_realm] section correct in Kerberos5 config?
yum -y install sssd-common
man sssd-krb5 # when System Security Services Daemon is used (krb5_server,krb5_
/etc/sssd/sssd.conf # this cache may contain KRB5 as well. Change needs restart of sssd
yum -y install krb5-workstation
klist # check if the user received TGT
klist -ek /etc/krb5.keytab # inspect keytabs, KVNO shows version of the password stored. Whe
cat /etc/exports.d/* /etc/auto.guests /etc/fstab # sec=krb5i vs. sec=krb5p = they must match everywhere, needs au
ssh -o PreferredAuthentications=keyboard-interactive,password ldapuser@server # when testing LDAP instead of SSH
# Kernel crash dump triggers: # 'sysctl -a | grep -e kernel -e vm' is helpful if you don't remember
echo "vm.panic_on_oom=1" >> /etc/sysctl.conf # panic on OOM-killer events permanently
echo "kernel.hung_task_panic=1" >> /etc/sysctl.conf # panic on hung process perm
cat /proc/sys/kernel/hung_task_timeout_secs # hung task timeout
echo "kernel.softlockup_panic=1" >> /etc/sysctl.conf # soft lockups (kernel loops in kernel mode) perm
echo "kernel.panic_on_io_nmi=1" >> /etc/sysctl.conf # nonrecoverable HW failure (NMI) perm
echo "kernel.sysrq=1" >> /etc/sysctl.conf # enable all magic sysrq (key sequence in case of unresponsive system)
echo "c" > /proc/sysrq-trigger # initiate a system crash (other sysrq keys: m,t,p,c,s,u,b,9,f,w)
sysctl -p # load in Kernel parameters
# For only the SystemTap runtime environment you need a single package:
yum -y install systemtap-runtime
man -P 'less +/PERMISSIONS' stap # see the PERMISSIONS section of the stap manpage for all the details
# User in "stapdev" & "stapusr" groups can run the module from anywhere (do this on the destination machine):
usermod -aG stapusr <USER> # can run SystemTap modules, but only if they exist in the /lib/m
usermod -aG stapdev <USER> # may compile their own SystemTap instrumentation kernel modules