1. Health Check - Clearing Warnings
# List crash reports
root@pve1:/var/lib/ceph/crash/posted# ceph crash ls
ID ENTITY NEW
2022-11-20_04:19:15.455086Z_6ff0a228-afae-42fb-9fda-6ce7d34f7f94 mgr.pve1
2022-11-20_04:19:41.515229Z_96666130-253f-4017-ab21-93d9e1e211e8 mgr.pve1
2022-11-20_04:23:09.157816Z_08376718-72ea-4a2f-815f-97cb126624ad mgr.pve1
2022-11-20_06:05:56.922776Z_b6601eb5-24b3-401a-9eb8-bf5742d45e25 mon.pve1
2022-11-25_01:56:59.517294Z_84848254-1c98-4ab0-97d4-f9f1a6897cc4 mgr.pve4
2022-11-25_02:16:16.380455Z_e29c25b0-4bf5-419b-a6fc-1eb642b0a9dd mgr.pve2
2022-11-26_02:51:08.407370Z_764cb25f-5f08-4fa3-a60e-64f42d82f2a1 mgr.pve2
2022-11-26_02:51:30.773033Z_cda57001-91e0-40f9-97fe-649ad45935a6 mgr.pve2
# Read the details of a crash report
ceph crash info <id>
# Archive a crash report to clear its historical warning
ceph crash archive <id>
Or:
ceph crash archive-all
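After archiving, the RECENT_CRASH health warning should clear. As a quick check (standard commands, not part of the original notes):
ceph crash ls-new
ceph health detail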
2. MON_CLOCK_SKEW
Ceph is strict about time: clock skew between monitors above 0.05s (the default mon_clock_drift_allowed) raises a warning, so a time sync is needed:
ntpdate pool.ntp.org
hwclock -w
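To verify the monitors agree again, ceph time-sync-status reports the measured skew; if the clocks are trustworthy but slightly loose, the threshold can also be raised (the 0.1 below is an illustrative assumption, not a recommendation):
ceph time-sync-status
ceph config set mon mon_clock_drift_allowed 0.1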
3. Removing an OSD
# First use ceph-volume lvm list to find the mapping between OSDs and disk devices (and locate the faulty disk)
root@pve1:~# ceph-volume lvm list
====== osd.1 =======

  [block]       /dev/ceph-7c7e8cc4-26fd-4cf8-b531-b94510ab63f8/osd-block-bc80a833-1221-426b-90c1-e910aeb3b0a1

      block device              /dev/ceph-7c7e8cc4-26fd-4cf8-b531-b94510ab63f8/osd-block-bc80a833-1221-426b-90c1-e910aeb3b0a1
      block uuid                sfTHK9-qTzt-dg00-7XN9-G4uE-2kzn-7f97Xs
      cephx lockbox secret
      cluster fsid              98d9e4a9-35d4-497f-96a0-60a62b9e9d64
      cluster name              ceph
      crush device class        None
      db device                 /dev/sdg2
      db uuid                   1fbdcce9-88e3-4282-81cc-6ad255730be3
      encrypted                 0
      osd fsid                  bc80a833-1221-426b-90c1-e910aeb3b0a1
      osd id                    1
      type                      block
      vdo                       0
      devices                   /dev/sdf

  [db]          /dev/sdg2

      PARTUUID                  1fbdcce9-88e3-4282-81cc-6ad255730be3
The disk sdg is faulty; it has to be removed and re-added to Ceph.
==================================================================
#a. Stop the corresponding OSD service
#systemctl stop [email protected]
#b. Unmount the OSD
#umount /var/lib/ceph/osd/ceph-1
#c. Mark the OSD as out
#ceph osd out osd.1
#d. Remove the OSD
#ceph osd crush remove osd.1 (this command is not needed if no CRUSH map is configured)
#ceph auth del osd.1
#ceph osd rm 1
#e. Wipe the contents of the removed disk
wipefs -af /dev/sdb
#After this step, reboot the node that hosted the OSD
#Once it is back up, zap the disk with the following command:
#ceph-volume lvm zap /dev/sdb
#A successful zap prints output like this:
--> Zapping: /dev/sdb
Running command: /usr/sbin/cryptsetup status /dev/mapper/
stdout: /dev/mapper/ is inactive.
Running command: /usr/sbin/wipefs --all /dev/sdb
Running command: /bin/dd if=/dev/zero of=/dev/sdb bs=1M count=10
stderr: 10+0 records in
10+0 records out
stderr: 10485760 bytes (10 MB) copied, 0.0195807 s, 536 MB/s
--> Zapping successful for: /dev/sdb
#f. Recreate the OSD
#pveceph osd create /dev/sd[X] -db_dev /dev/sd[Y] (the OSD can also be created from the Ceph panel in the PVE web UI)
#g. Check the OSD tree
ceph osd tree
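For reference, steps a-e gathered into a minimal sketch. OSD_ID and DEV are illustrative assumptions; the safe-to-destroy wait is an addition taken from Ceph's documented replacement workflow, which marks the OSD out before stopping it so data can drain first:
OSD_ID=1
DEV=/dev/sdb
ceph osd out osd.${OSD_ID}
while ! ceph osd safe-to-destroy osd.${OSD_ID}; do sleep 10; done   # wait for PGs to move off this OSD
systemctl stop ceph-osd@${OSD_ID}
umount /var/lib/ceph/osd/ceph-${OSD_ID}
ceph osd crush remove osd.${OSD_ID}
ceph auth del osd.${OSD_ID}
ceph osd rm ${OSD_ID}
wipefs -af ${DEV}
# reboot the node, then:
ceph-volume lvm zap ${DEV}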
4. Upgrading Ceph from 14.2.4 to 14.2.20 (fixes the insecure global_id reclaim security bug, CVE-2021-20288, affecting clients that reconnect or re-establish connections)
cat >> /etc/apt/sources.list <<EOF
deb http://download.proxmox.com/debian/pve buster pve-no-subscription
EOF
apt-get update
apt dist-upgrade -y
#After the upgrade, restart the mon, mgr, mds and osd daemons on every node (one way is sketched below)
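A sketch of the rolling restart, assuming the systemd targets shipped with the Ceph packages; setting noout first avoids needless rebalancing while OSDs bounce (an addition, not in the original notes):
ceph osd set noout                  # once, before the rolling restarts
systemctl restart ceph-mon.target   # then per node, one node at a time:
systemctl restart ceph-mgr.target
systemctl restart ceph-mds.target
systemctl restart ceph-osd.target
ceph osd unset noout                # once, after all nodes are done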
#At this point the Ceph cluster will show two warnings:
client is using insecure global_id reclaim
mons are allowing insecure global_id reclaim
#Restart these two services (run on every node)
systemctl try-reload-or-restart pvestatd.service pvedaemon.service
#Clear the warning (run on any one Ceph node)
ceph config set mon auth_allow_insecure_global_id_reclaim false
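Note: flip this switch only after every client and daemon has been upgraded, since unpatched clients can no longer connect afterwards. The warnings carry the health code AUTH_INSECURE_GLOBAL_ID_RECLAIM, and their disappearance can be confirmed with:
ceph health detail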
5. Exporting and Importing Images from Ceph
#List images
rbd -p cephfs_data ls
rbd -p cephfs_data ls|grep base-1001-disk-0
# Export
rbd export vm-170-state-pz_2021_04_26 -p cephfs_data vm-170-state-pz_2021_04_26.img
root@pve1:/# rbd export base-1001-disk-0 -p cephfs_data base-1001-disk-0.img
#Import
qm importdisk <vmid> <imagename.img> <storage name>
# If the target PVE node has no rbd command, run the export on a node that has Ceph, then run qm importdisk on the target node
#For example:
On pve6, run: rbd export base-1001-disk-0 -p cephfs_data base-1001-disk-0.img
On pve13, import: qm importdisk 141 base-1001-disk-0.img ceph_data
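qm importdisk leaves the image attached as an "unused" disk, so it still has to be wired to a bus. A sketch, assuming the import landed as vm-141-disk-0 on the ceph_data storage (importdisk prints the actual volume name):
qm set 141 --scsi1 ceph_data:vm-141-disk-0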