RH342 - Default - 20190726 Book
RH342 - Default - 20190726 Book
RH342 - Default - 20190726 Book
1. Troubleshooting principles
2. Generic issues
# Persisting journal:
mkdir /var/log/journal
chown root:systemd-journal /var/log/journal
chmod 2755 /var/log/journal
killall -USR1 systemd-journald # or reboot
# journalctl examples:
journalctl -ef # end & follow
journalctl _SYSTEMD_UNIT=sshd.service # generated by sshd service
journalctl -u sshd.service # generated and about sshd service
journalctl -p emerg..err # priority between emergency and error
journalctl -b -1 # only from the previous boot
journalctl --since "2019-01-01 20:30:00" --until "2019-02-02 12:00:00"
journalctl -o verbose # show all fields
# Using RH resources:
yum -y install sos
sosreport --help | less
sosreport -l | less # view currently enabled/disabled plugins and plugin options
sosreport -o <PLUGIN(S)> # enable these plugins only (it will only run these plugins)
sosreport -n <PLUGIN(S)> # skip these plugins (it will run all of the plugins, except for these)
sosreport -e <PLUGIN(S)> # enable previously disabled plugins
sosreport -k xfs.logprint # xfs module and logprint option enabled
redhat-support-tool
# Insights:
yum -y install redhat-access-insights
redhat-access-insights --register
# Cockpit:
yum -y install cockpit
systemctl start cockpit
firewall-cmd --add-service=cockpit --permanent
firewall-cmd --reload # http://localhost:9090
# Co-pilot:
yum -y install pcp # performance co-pilot
systemctl start pmcd # performance metrics collector daemon
systemctl enable pmcd
pmstat -s 5 # 5 samples
pmatop # machine stats and data
pminfo # obtain list of metrics
pminfo -dt proc.nprocs # understand specific metric
pmval -s 5 proc.nprocs # gather sample data about the metric 5x times
pmval -T 1minute kernel.percpu.cpu.idle # per-CPU idle time for one minute
# Historical data:
systemctl start pmlogger # ability to store metrics data to logs (-a <ARCHIVE>)
systemctl enable pmlogger
pcp | grep 'primary logger' # location of the archive log that pmlogger is writing to
ls /var/log/pcp/pmlogger/<HOSTNAME> # collects data every second to this location
pmval -a <ARCHIVE.xz> -f 3 <METRIC> # performance metrics value dump from archive with 3 digits precision
pmval -a /var/log/pcp/pmlogger/serverX.example.com/20190101.00.10.0 kernel.all.load
pmval -a /var/log/pcp/pmlogger/serverX.example.com/20190101.00.10.0 kernel.all.load \
-S '@ Tue Feb 01 12:00:00 2019' -T '@ Tue Feb 01 13:00:00 2019'
$template DynamicFile,"/var/log/loghost/%HOSTNAME%/cron.log"
cron.* ?DynamicFile # 'DynamicFile' here is an arbitrary template name
$template DynamicFile,"/var/log/loghost/%HOSTNAME%/%syslogfacility-text%.log"
*.* -?DynamicFile # minus turns off syncing of the log file after each write
systemctl restart rsyslog
firewall-cmd --add-port=514/udp --permanent
firewall-cmd --add-port=514/tcp --permanent
firewall-cmd --reload
# Monitor changes:
yum -y install aide # intrusion detection
vim /etc/aide.conf
# Configuration lines:
PERMS = p+i+u+g+acl+selinux # file (p)ermissions, (i)node, (u)ser/(g)roup ownership, acl, selinux
# Selection lines:
/dir1 PERMS # group check on dir1 and all files and dirs below it
=/dir2 PERMS # group check in dir2, but not recursively
!/dir3 # excludes dir3 and all files below it from any checks
# Macro lines:
@@define VAR value # @@{VAR} is reference to the macro defined previously
aide --init # creates /var/lib/aide/aide.db.new.gz every time
mv -v /var/lib/aide/aide.db.new.gz /var/lib/aide/aide.db.gz
aide --check
3. Boot issues
# Configuring Grub2:
vim /etc/default/grub
GRUB_TIMEOUT = seconds the menu is displayed
GRUB_DEFAULT = starts counting from 0, what is the default entry
GRUB_CMDLINE_LINUX = list of extra kernel params, e.g. "rhgb quiet"
grub2-mkconfig -o /boot/grub2/grub.cfg # this is *.cfg and NOT *.conf
# Reinstalling Grub2 into the MBR, must reboot into rescue environment (e.g. anaconda):
chroot /mnt/sysimage
ls -l /boot
grub2-install /dev/vda # rewrite the boot loader sections of the MBR
Requires=
stopping listed unit will also stop this unit as well
RequiresOverridable=
failed requirements will not cause the unit to fail when explicitly started
Requisite=,RequisiteOverridable=
the unit will fail if required unit is not already running
After=
listed units have to have finished starting before this unit can be started
Before=
listed units will be delayed
Wants=
when wanted unit fails to start, this unit itself will still start
Conflicts=
starting this unit will stop the conflicting units
systemctl daemon-reload # this is required after each change
systemctl list-dependencies <UNITNAME>
systemctl list-unit-files
systemctl status # shows tree of services and corresponding PIDs
4. Hardware issues
# Basic commands:
lscpu # identifying processor
cat /proc/cpuinfo # identifying what flags CPU supports
dmidecode -t memory # identifying memory
lsscsi -v # identifying disks
hdparm -I /dev/sda # more information about individual disks
lspci # identifying PCI hardware
lsusb # identifying USB hardware
# Memory failures:
# Older:
yum -y install mcelog # framework for catching and logging exceptions
# Newer:
yum -y install rasdaemon # modern replacement for mcelog
systemctl enable rasdaemon
systemctl start rasdaemon
ras-mc-ctl --status # what does subsystem know about memory
ras-mc-ctl --errors
# Test memory:
yum -y install memtest86+
memtest-setup # this adds new template to Grub2 (/etc/grub.d/)
grub2-mkconfig -o /boot/grub2/grub.cfg # update Grub2 config
# Overcommitting resources:
virsh nodecpustats
virsh nodememstats
virsh dommemstats <DOMAIN>
5. Storage issues
# XFS:
yum -y install xfsprogs
umount /dev/<DEVICE> # re-mount on systems where journal corruption suspected
xfs_repair -n /dev/<DEVICE> # perform only check
xfs_repair [-o force_geometry] /dev/<DEVICE> # perform all corrective actions, shows invalid inodes
mount /dev/<DEVICE> /mountpoint
ls /mountpoint/lost+found # unreferenced files
find /mountpoint -inum <NUMBER> # locate directory with the inode number
diff -s /file/from/backup /mountpoint/lost+found/<NUMBER>
# If corrupt journal log:
xfs_repair -L /dev/<DEVICE> # zeros out the journal log, potentially dangerous
# Recovering LVM:
# Config file:
vim /etc/lvm/lvm.conf
dir # scan for physical volumes (/dev)
obtain_device_list_from_udev # should udev be used (1)
preferred_names # which path name to display for block device
filter # which devices to scan for presence of PV signature
backup # save text-based metadata before each disk change (1)
backup_dir # where the backup of VG metadata should be stored
archive # should old configurations be also archived (1)
archive_dir # where the archives will be stored
retain_min # minimum number of archives to store
retain_days # minimum number of days for archive to be kept
# iSCSI initiator/client:
yum -y install iscsi-initiator-utils # ’systemctl enable iscsi --now’
iscsiadm -m node # see already discovered targets/node records
iscsiadm -m session [-P 3] # validate sessions or connections, P=print level
vim /etc/iscsi/iscsid.conf # restart iscsi/iscsid every time you change this file
discovery.sendtargets.auth.<authmethod|username|password|username_in|password_in>
node.session.auth.<authmethod|username|password|username_in|password_in>
vim /etc/iscsi/initiatorname.iscsi # this needs iscsid restart
InitiatorName=iqn.2016-01.com.example.lab:servera
systemctl restart iscsid
iscsiadm -m discovery -t st -p <TARGET>:<PORT> # discovery & sendtargets for portal -> /var/lib/iscsi/nodes
iscsiadm -m node -T iqn.2016-01.com.example.lab:iscsistorage --login [-d8] # -d8=debug
# Disable CHAP authentication:
iscsiadm -m node -T iqn.2016-01.com.example.lab:iscsistorage -o update -n node.session.auth.authmethod \
-v None [-p <TARGET>:<PORT>] # o=overwrite previous config,n=name,v=value
# Purge all node information from cache, recommended when server’s setting change:
iscsiadm -m node -T iqn.2016-01.com.example.lab:iscsistorage -o delete [-p <TARGET>:<PORT>]
# Purge all known nodes from cache:
iscsiadm -m node -o delete [-p <TARGET>:<PORT>] # default port 3260/tcp
lsblk --scsi
6. RPM issues
7. Network issues
ping -c 1 -W 3 <IPv4> # send single echo request and wait 3s for reply
ping6 [-I <INTERFACE>] <IPv6> # -I is not needed when routable IPv6 is used
# Troubleshooting:
ip addr show dev <DEVICE_NAME>
ip route
nmcli con # display connection information
nmcli dev # display device information
nmcli conn show '<CONNECTION_NAME>' | grep ipv # all config settings
ipv4.method # auto=dhcp, manual=static (needs addresses,gateway)
ipv6.method
nmcli conn mod '<CONNECTION_NAME>' ipv4.dns '<IPv4>' # good alternative: 'nmtui', restart affected services
nmcli conn reload # after you manually edit network-scripts
nmcli conn down '<CONNECTION_NAME>' # changes are not applied to already active interface...
nmcli conn up '<CONNECTION_NAME>' # ...also updates /etc/resolv.conf
firewall-cmd --list-all-zones [--permanent] # comparing active and permanent can identify problems
firewall-cmd --runtime-to-permanent # quick convert of runtime rules to permanent
host -v -t aaaa <HOSTNAME> <DNS> # query DNS for hostname’s IPv6
8. Application issues
9. Security issues
# SELinux logging (or /var/log/audit/audit.log):
ausearch -m avc -ts recent # display Access Vector Cache (AVC) messages, last 10mins
# Troubleshooting SELinux:
yum -y install setroubleshoot-server # provides ’sealert’, ’sedispatch’ plugin for auditd
service auditd restart # auditd’s sedispatch plugin requires restart, don’t use systemd
sealert -l <UUID_OF_DENIAL>
sealert -a /var/log/audit/audit.log # parse all denial messages out of file
# Common issues:
semanage fcontext -a -t <TYPE> '<PATH>(/.*)?' # add path to the list of standard file contexts
restorecon -Rv <PATH> # apply the new file contexts
touch /.autorelabel # perform automatic relabel of all files after disabled->enforcing
semanage boolean --list # current/default state + description of all SELinux toggles
setsebool -P <BOOLEAN=ON/OFF> # boolean values updated permanently, without -P only in memory
semanage port -a -t <TYPE> -p tcp <PORT> # label an unlabeled port (e.g. http_port_t on port 8001 etc.)
# vsftpd public directory, where anonymous are allowed to write:
semanage fcontext -a -t public_content_rw_t '/var/ftp/pub(/.*)?'
setsebool -P allow_ftpd_anon_write=1
# Troubleshooting PAM:
tail -f /var/log/secure
journalctl -u <PROBLEMATIC_SERVICE> # good start for logging issues: ’journalctl _COMM=login’
rpm -V <PROBLEMATIC_SERVICE> # did the files belonging to service change, especially PAM config
mv /etc/pam.d/<PROBLEMATIC_FILE>{,.broken} # rename broken PAM config, otherwise reinstall will not touch it
yum reinstall <PROBLEMATIC_SERVICE>
diff -u /etc/pam.d/<PROBLEMATIC_FILE>{,.broken} # compare good and bad PAM config of problematic service
authconfig
authconfig-tui
authconfig-gtk
authconfig --updateall # recreate all configuration files and re-apply the configuration
yum -y install pam_krb5 # when the module for Kerberos is missing
# Solving LDAP issues:
yum -y install openldap-clients # set of tools
cat /etc/openldap/ldap.conf # LDAP defaults (BASE,URI etc.), usually port TCP/389 with STARTTLS
mv <CRT> /etc/openldap/cacerts;cacertdir_rehash /etc/openldap/cacerts # when CAs mismatches are happening, this needs to be done
ldapsearch -x -ZZ -LL '(uid=ldapuser)' \
cn homeDirectory # -x simple auth, -ZZ enforce TLS, -LL disable comments in output
getent passwd <LDAP_USER> # uses nsswitch.conf to query backend password systems
# Solving Kerberos issues:
kinit <USER> # obtain TGT (ticket granting ticket), time must match everywhere
cat /etc/krb5.conf | grep -A 1 domain_realm # is the [domain_realm] section correct in Kerberos5 config?
yum -y install sssd-common
man sssd-krb5 # when System Security Services Daemon is used (krb5_server,krb5_
/etc/sssd/sssd.conf # this cache may contain KRB5 as well. Change needs restart of sssd
yum -y install krb5-workstation
klist # check if the user received TGT
klist -ek /etc/krb5.keytab # inspect keytabs, KVNO shows version of the password stored. Whe
cat /etc/exports.d/* /etc/auto.guests /etc/fstab # sec=krb5i vs. sec=krb5p = they must match everywhere, needs au
ssh -o PreferredAuthentications=keyboard-interactive,password ldapuser@server # when testing LDAP instead of SSH
# Kernel crash dump triggers: # 'sysctl -a | grep -e kernel -e vm' is helpful if you don't remember
echo "vm.panic_on_oom=1" >> /etc/sysctl.conf # panic on OOM-killer events permanently
echo "kernel.hung_task_panic=1" >> /etc/sysctl.conf # panic on hung process perm
cat /proc/sys/kernel/hung_task_timeout_secs # hung task timeout
echo "kernel.softlockup_panic=1" >> /etc/sysctl.conf # soft lockups (kernel loops in kernel mode) perm
echo "kernel.panic_on_io_nmi=1" >> /etc/sysctl.conf # nonrecoverable HW failure (NMI) perm
echo "kernel.sysrq=1" >> /etc/sysctl.conf # enable all magic sysrq (key sequence in case of unresponsive system)
echo "c" > /proc/sysrq-trigger # initiate a system crash (other sysrq keys: m,t,p,c,s,u,b,9,f,w)
sysctl -p # load in Kernel parameters
# For only the SystemTap runtime environment you need a single package:
yum -y install systemtap-runtime
man -P 'less +/PERMISSIONS' stap # see the PERMISSIONS section of the stap manpage for all the details
# User in "stapdev" & "stapusr" groups can run the module from anywhere (do this on the destination machine):
usermod -aG stapusr <USER> # can run SystemTap modules, but only if they exist in the /lib/modules/<KERNEL_VERSION>/systemtap directory
usermod -aG stapdev <USER> # may compile their own SystemTap instrumentation kernel modules