Debian EFI cold boot fails, msdos vs gpt partition confusion

I’m running into the following problem:

  • mender-convert image with Debian 11 and GRUB
  • Dell workstations (2022), EFI boot only, NVMe hard disks
  • The image boots from the hard disk if the generic hard-disk boot entry is selected:
Boot0000* UEFI PM9A1 NVMe Samsung 512GB S6H3NX0T623062 1	HD(1,GPT,cada89bc-aee6-4593-accf-e9c908fb3800,0x4000,0x80000)/File(\EFI\Boot\BootX64.efi)N.....YM....R,Y.
  • The image does not boot if the corresponding debian boot entry is selected:
Boot000B* debian	HD(1,GPT,cada89bc-aee6-4593-accf-e9c908fb3800,0x4000,0x80000)/File(\EFI\debian\shimx64.efi)
  • Error message in grub (?):
Loading Linux 5.10.0-18-amd64 ...
error: disk 'hd0,msdos3' not found.

Has anybody ever seen something like that?

efibootmgr -v:

BootCurrent: 000B
Timeout: 0 seconds
BootOrder: 000B,0007,0000,0003,0006,0009,0001,0002,0004,0005,0008,000A
Boot0000* UEFI PM9A1 NVMe Samsung 512GB S6H3NX0T623062 1	HD(1,GPT,cada89bc-aee6-4593-accf-e9c908fb3800,0x4000,0x80000)/File(\EFI\Boot\BootX64.efi)N.....YM....R,Y.
Boot0001* ONBOARD NIC (IPV4)	PciRoot(0x0)/Pci(0x1f,0x6)/MAC(00be439542ad,0)/IPv4(0.0.0.00.0.0.0,0,0)N.....YM....R,Y.
Boot0002* ONBOARD NIC (IPV6)	PciRoot(0x0)/Pci(0x1f,0x6)/MAC(00be439542ad,0)/IPv6([::]:<->[::]:,0,0)N.....YM....R,Y.
Boot0003* UEFI HTTPs Boot	PciRoot(0x0)/Pci(0x1f,0x6)/MAC(00be439542ad,0)/IPv4(0.0.0.00.0.0.0,0,0)/Uri()N.....YM....R,Y.
Boot0004* UEFI PXEv4 (MAC:507C6F1AC890)	PciRoot(0x0)/Pci(0x1b,0x0)/Pci(0x0,0x0)/MAC(507c6f1ac890,1)/IPv4(0.0.0.00.0.0.0,0,0)N.....YM....R,Y.
Boot0005* UEFI PXEv6 (MAC:507C6F1AC890)	PciRoot(0x0)/Pci(0x1b,0x0)/Pci(0x0,0x0)/MAC(507c6f1ac890,1)/IPv6([::]:<->[::]:,0,0)N.....YM....R,Y.
Boot0006* UEFI HTTPs Boot 2	PciRoot(0x0)/Pci(0x1b,0x0)/Pci(0x0,0x0)/MAC(507c6f1ac890,1)/IPv4(0.0.0.00.0.0.0,0,0)/Uri()N.....YM....R,Y.
Boot0007* UEFI PXEv4 (MAC:507C6F1AC892)	PciRoot(0x0)/Pci(0x1b,0x0)/Pci(0x0,0x1)/MAC(507c6f1ac892,1)/IPv4(0.0.0.00.0.0.0,0,0)N.....YM....R,Y.
Boot0008* UEFI PXEv6 (MAC:507C6F1AC892)	PciRoot(0x0)/Pci(0x1b,0x0)/Pci(0x0,0x1)/MAC(507c6f1ac892,1)/IPv6([::]:<->[::]:,0,0)N.....YM....R,Y.
Boot0009* UEFI HTTPs Boot 3	PciRoot(0x0)/Pci(0x1b,0x0)/Pci(0x0,0x1)/MAC(507c6f1ac892,1)/IPv4(0.0.0.00.0.0.0,0,0)/Uri()N.....YM....R,Y.
Boot000A* mzos	PciRoot(0x0)/Pci(0x6,0x0)/Pci(0x0,0x0)/NVMe(0x1,00-00-00-00-00-00-00-00)/HD(1,GPT,cada89bc-aee6-4593-accf-e9c908fb3800,0x4000,0x80000)/File(\EFI\debian\shimx64.efi)
Boot000B* debian	HD(1,GPT,cada89bc-aee6-4593-accf-e9c908fb3800,0x4000,0x80000)/File(\EFI\debian\shimx64.efi)

The problem happens only occasionally, not on every boot. In particular, it seems to happen if we unplug the power during boot, then plug it back in and boot again.

grub.cfg generated by mender-convert:
#
# DO NOT EDIT THIS FILE
#
# It is automatically generated by grub-mkconfig using templates
# from /etc/grub.d and settings from /etc/default/grub
#

### BEGIN /etc/grub.d/00_00_mender_grubenv_defines ###
# GRUB disk that holds the Mender partitions. This is only the initial value;
# 00_05_mender_setup_env_grub later overwrites it from ${root}.
mender_grub_storage_device=hd0

# Root filesystem A: partition number and the partition UUID that is passed to
# the kernel as root=PARTUUID=...
mender_rootfsa_part=2
mender_rootfsa_uuid=cada89bc-aee6-4593-accf-e9c908fb3801

# Root filesystem B: partition number and partition UUID.
mender_rootfsb_part=3
mender_rootfsb_uuid=cada89bc-aee6-4593-accf-e9c908fb3802

# Image type names; not referenced anywhere else in the visible configuration.
kernel_imagetype=kernel
initrd_imagetype=initrd

### END /etc/grub.d/00_00_mender_grubenv_defines ###

### BEGIN /etc/grub.d/00_04_mender_setup_env_functions_grub ###
# See the grub-mender-grubenv-print script for how this works.

# In this file we are skipping signature checking in most places. This is
# because Mender's environment is by nature dynamic, and cannot have a static
# signature. Instead, we make sure the content is valid.

# Free form variables can not be supported when signatures are
# enforced. "mender_systemd_machine_id" is such a variable, so it is not
# supported when signatures are on.

# Note that Secure Boot and GRUB signatures are two different things, and here
# we are talking about the latter.

# Point MENDER_ENV1/2 and MENDER_LOCK1/2 at the two redundant environment
# copies (and their lock files) under /grub-mender-grubenv on the current
# ${root} device, then verify that all four files exist.
#
# If any file is missing: with check_signatures=enforce the machine reboots
# after 10 seconds; otherwise a warning is printed and execution continues
# after 10 seconds.
function mender_setup_env_location {
    MENDER_ENV1=(${root})/grub-mender-grubenv/mender_grubenv1/env
    MENDER_ENV2=(${root})/grub-mender-grubenv/mender_grubenv2/env
    MENDER_LOCK1=(${root})/grub-mender-grubenv/mender_grubenv1/lock
    MENDER_LOCK2=(${root})/grub-mender-grubenv/mender_grubenv2/lock

    if [ ! -f ${MENDER_ENV1} -o ! -f ${MENDER_LOCK1} -o ! -f ${MENDER_ENV2} -o ! -f ${MENDER_LOCK2} ]; then
        if [ "${check_signatures}" != "enforce" ]; then
            echo "The environment was not found. Tried to access ${MENDER_ENV1}. Continuing in 10 seconds..."
            sleep 10
            # Deliberately carry on: the caller will most likely end up in its
            # "Environment is corrupt" handling.
        else
            echo "Signatures are enabled and the environment could not be found. Rebooting in 10 seconds..."
            sleep 10
            reboot
        fi
    fi
}

# Repair the redundant environment after an interrupted write.
#
# Each environment copy (MENDER_ENV1 / MENDER_ENV2) has a lock file holding the
# variable "editing". mender_save_env sets it to 1 before writing the copy and
# back to 0 afterwards, so any value other than 0 marks that copy as suspect.
#
# Copy 2 is checked first: if its lock is not 0, copy 2 is rewritten from
# copy 1. Only when copy 2 is fine is copy 1 checked and, if needed, rewritten
# from copy 2. At most one copy is restored per call.
#
# Side effects: sets bootcount, mender_boot_part, upgrade_available (and
# mender_systemd_machine_id when signatures are not enforced) in the GRUB
# environment whenever a restore takes place, and leaves editing=0 after a
# restore.
function mender_check_and_restore_env {
    mender_setup_env_location
    # Pre-seed with a non-0 value, presumably so that a failed load_env is
    # treated the same as an unfinished write.
    editing=invalid
    load_env --skip-sig --file ${MENDER_LOCK2} editing
    if [ "${editing}" != 0 ]; then
        # Copy 2 is suspect: rebuild it from copy 1.
        # See comment about "free form" variables near the top.
        if [ "$check_signatures" = "enforce" ]; then
            load_env --skip-sig --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available
            save_env --file ${MENDER_ENV2} bootcount mender_boot_part upgrade_available
        else
            load_env --skip-sig --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
            save_env --file ${MENDER_ENV2} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
        fi
        # Mark copy 2 as consistent again.
        editing=0
        save_env --file ${MENDER_LOCK2} editing
    else
        # Copy 2 is consistent; now apply the same check to copy 1.
        editing=invalid
        load_env --skip-sig --file ${MENDER_LOCK1} editing
        if [ "${editing}" != 0 ]; then
            # Copy 1 is suspect: rebuild it from copy 2.
            # See comment about "free form" variables near the top.
            if [ "$check_signatures" = "enforce" ]; then
                load_env --skip-sig --file ${MENDER_ENV2} bootcount mender_boot_part upgrade_available
                save_env --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available
            else
                load_env --skip-sig --file ${MENDER_ENV2} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
                save_env --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
            fi
            # Mark copy 1 as consistent again.
            editing=0
            save_env --file ${MENDER_LOCK1} editing
        fi
    fi
}

# Write the current bootcount, mender_boot_part and upgrade_available (plus
# mender_systemd_machine_id when signatures are not enforced) to both
# environment copies.
#
# Each copy is written inside an editing=1 ... editing=0 bracket stored in its
# lock file, and copy 2 is completed before copy 1 is touched. An interruption
# therefore leaves at most one copy flagged as being edited, which
# mender_check_and_restore_env uses to pick the copy to restore from.
#
# Side effect: leaves the variable "editing" set to 0.
function mender_save_env {
    # Save redundant environment.
    mender_setup_env_location
    # --- copy 2 ---
    editing=1
    save_env --file ${MENDER_LOCK2} editing
    # See comment about "free form" variables near the top.
    if [ "$check_signatures" = "enforce" ]; then
        save_env --file ${MENDER_ENV2} bootcount mender_boot_part upgrade_available
    else
        save_env --file ${MENDER_ENV2} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
    fi
    editing=0
    save_env --file ${MENDER_LOCK2} editing

    # --- copy 1 ---
    editing=1
    save_env --file ${MENDER_LOCK1} editing
    # See comment about "free form" variables near the top.
    if [ "$check_signatures" = "enforce" ]; then
        save_env --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available
    else
        save_env --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
    fi
    editing=0
    save_env --file ${MENDER_LOCK1} editing
}

# Validate the variables loaded from the Mender environment.
#
# Returns 0 only when all of the following hold:
#   - mender_boot_part equals mender_rootfsa_part or mender_rootfsb_part
#   - bootcount is "0" or "1"
#   - upgrade_available is "0" or "1"
# Returns 1 otherwise.
function mender_check_grubenv_valid {
    if [ "${mender_boot_part}" = "${mender_rootfsa_part}" -o "${mender_boot_part}" = "${mender_rootfsb_part}" ]; then
        if [ "${bootcount}" = "0" -o "${bootcount}" = "1" ]; then
            if [ "${upgrade_available}" = "0" -o "${upgrade_available}" = "1" ]; then
                return 0
            fi
        fi
    fi

    # At least one variable holds an unexpected value.
    return 1
}

# Load bootcount, mender_boot_part and upgrade_available from environment
# copy 1 (MENDER_ENV1), export them, and validate them.
#
# mender_systemd_machine_id is a "free form" variable (see the comment near the
# top of this section), so it is only loaded and exported when signatures are
# not enforced.
#
# When validation fails:
#   - check_signatures=enforce: reboot after 10 seconds.
#   - otherwise: unless mender_boot_part already names rootfs B, fall back to
#     rootfs A, warn, and continue after 10 seconds.
function mender_load_env {
    mender_setup_env_location

    if [ "$check_signatures" != "enforce" ]; then
        load_env --skip-sig --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available mender_systemd_machine_id
        export mender_systemd_machine_id
    else
        load_env --skip-sig --file ${MENDER_ENV1} bootcount mender_boot_part upgrade_available
    fi
    export upgrade_available
    export mender_boot_part
    export bootcount

    if ! mender_check_grubenv_valid; then
        if [ "${check_signatures}" != "enforce" ]; then
            # Best effort: keep B if that is what was selected, else use A.
            if [ "${mender_boot_part}" != "${mender_rootfsb_part}" ]; then
                mender_boot_part="${mender_rootfsa_part}"
            fi
            echo "The environment is corrupt. Trying to boot from ${mender_kernel_root_base}${mender_boot_part} in 10 seconds, but this is not guaranteed to be a valid partition..."
            sleep 10
            # Carry on booting with the chosen partition.
        else
            echo "Signatures are enabled and the environment is unverified. Rebooting in 10 seconds..."
            sleep 10
            reboot
        fi
    fi
}

# Load the environment and apply the update/rollback state machine.
#
# Only acts when upgrade_available is "1":
#   - bootcount "0": first boot of a new update; set bootcount=1.
#   - any other bootcount: the update did not confirm itself; switch
#     mender_boot_part to the other root filesystem and reset
#     upgrade_available and bootcount to 0.
# In both cases the new state is persisted with mender_save_env.
function mender_load_env_with_rollback {
    mender_load_env

    if [ "${upgrade_available}" = "1" ]; then
        if [ "${bootcount}" = "0" ]; then
            echo "Booting new update..."
            bootcount=1
        else
            echo "Rolling back..."
            # Swap A <-> B.
            if [ "${mender_boot_part}" != "${mender_rootfsa_part}" ]; then
                mender_boot_part="${mender_rootfsa_part}"
            else
                mender_boot_part="${mender_rootfsb_part}"
            fi
            bootcount=0
            upgrade_available=0
        fi

        mender_save_env
    fi
}

### END /etc/grub.d/00_04_mender_setup_env_functions_grub ###

### BEGIN /etc/grub.d/00_05_mender_setup_env_grub ###
# Repair a half-written environment copy first, then load the environment and
# apply the bootcount/rollback logic.
mender_check_and_restore_env
mender_load_env_with_rollback
# Replace the mender_grub_storage_device default with the part of ${root} in
# front of the comma (e.g. "hd1" from "hd1,gpt1").
# NOTE(review): this relies on `-s VAR` without an index selecting the first
# capture group, and on the variable being left untouched when ${root}
# contains no comma - confirm against the GRUB manual.
regexp (.*),(.*) $root -s mender_grub_storage_device
### END /etc/grub.d/00_05_mender_setup_env_grub ###

### BEGIN /etc/grub.d/00_80_mender_choose_partitions_grub ###
# Historical note: The "mender_boot_part" variable means "partition to use as
# root filesystem while booting", not "the boot partition". So it would be
# better if it was named "mender_rootfs_part", but we can't rename it for
# compatibility reasons. The rest of the variable names follow the latter
# logic.

# Step 1: pick the partition GRUB loads the kernel from. If a dedicated kernel
# partition is defined for the selected rootfs (mender_kernela_part /
# mender_kernelb_part), use it with an empty path prefix; otherwise the kernel
# is read from /boot on the rootfs partition itself.
# NOTE(review): the literal word `test` after `-a` is unusual; presumably
# GRUB's `[` treats it as a non-empty string operand - confirm before
# simplifying.
if [ "${mender_boot_part}" = "${mender_rootfsa_part}" -a test -n "${mender_kernela_part}" ]; then
    mender_ptable_part=${mender_kernela_part}
    mender_kernel_path=""
elif [ "${mender_boot_part}" = "${mender_rootfsb_part}" -a test -n "${mender_kernelb_part}" ]; then
    mender_ptable_part=${mender_kernelb_part}
    mender_kernel_path=""
else
    mender_ptable_part=${mender_boot_part}
    mender_kernel_path="/boot"
fi

# Step 2: set ${root}. The GPT name is used when (disk,gptN)/ can be accessed;
# in every other case the msdos name is used WITHOUT being probed. A failed
# GPT probe (wrong disk in mender_grub_storage_device, unreadable partition)
# therefore surfaces later as an error about "<disk>,msdosN".
if test -e (${mender_grub_storage_device},gpt${mender_ptable_part})/; then
    root="${mender_grub_storage_device},gpt${mender_ptable_part}"
else
    root="${mender_grub_storage_device},msdos${mender_ptable_part}"
fi

# Step 3: build the kernel's root= argument. With both partition UUIDs known,
# use PARTUUID=...; otherwise fall back to a device-name prefix plus the
# partition number. When the UUIDs are set but mender_boot_part matches neither
# rootfs, mender_kernel_root is left unchanged.
if test -n "${mender_rootfsa_uuid}" -a test -n  "${mender_rootfsb_uuid}"; then
    if [ "${mender_boot_part}" = "${mender_rootfsa_part}" ]; then
        mender_kernel_root="PARTUUID=${mender_rootfsa_uuid}"
    elif [ "${mender_boot_part}" = "${mender_rootfsb_part}" ]; then
        mender_kernel_root="PARTUUID=${mender_rootfsb_uuid}"
    fi
else
    mender_kernel_root="${mender_kernel_root_base}${mender_boot_part}"
fi

### END /etc/grub.d/00_80_mender_choose_partitions_grub ###

### BEGIN /etc/grub.d/00_90_mender_boot_selected_rootfs ###
# Hand over to the configuration stored on the selected root filesystem
# (${root} was set by the partition-selection section). Control only comes
# back here if that configuration did not boot a kernel.
configfile /boot/grub-mender-grubenv.cfg

# Failure handling, in order of precedence:
#   upgrade in progress   -> reboot (the bootcount logic then rolls back)
#   signatures enforced   -> halt
#   otherwise             -> leave normal mode and drop to the prompt
if [ "${upgrade_available}" != "1" ]; then
    if [ "${check_signatures}" != "enforce" ]; then
        echo "Returned from rootfs boot script. This could mean that the currently selected rootfs is corrupt. But there is no upgrade in progress, so dropping to prompt in 10 seconds..."
        sleep 10
        normal_exit
    else
        echo "Returned from rootfs boot script. This could mean that the currently selected rootfs is corrupt. But there is no upgrade in progress, and Signature Verification is on, so there is nothing we can do. Shutting down in 10 seconds..."
        sleep 10
        halt
    fi
else
    echo "Returned from rootfs boot script. This could mean that the currently selected rootfs is corrupt, and the upgrade failed. Rebooting in 10 seconds..."
    sleep 10
    reboot
fi
### END /etc/grub.d/00_90_mender_boot_selected_rootfs ###

### BEGIN /etc/grub.d/00_header ###
# Load the standard GRUB environment block if $prefix/grubenv exists and is
# non-empty.
if [ -s $prefix/grubenv ]; then
  set have_grubenv=true
  load_env
fi
# One-shot entry: when next_entry is set, boot it once. The variable is cleared
# and saved back before booting so that it only applies to this boot.
if [ "${next_entry}" ] ; then
   set default="${next_entry}"
   set next_entry=
   save_env next_entry
   set boot_once=true
else
   set default="0"
fi

# Use "--id" for menu entry identifiers only when this GRUB advertises the
# feature; otherwise pass nothing.
if [ x"${feature_menuentry_id}" = xy ]; then
  menuentry_id_option="--id"
else
  menuentry_id_option=""
fi

export menuentry_id_option

# If prev_saved_entry is set, copy it into saved_entry, clear it, and persist
# both. boot_once is set so that savedefault does not overwrite saved_entry
# during this boot.
if [ "${prev_saved_entry}" ]; then
  set saved_entry="${prev_saved_entry}"
  save_env saved_entry
  set prev_saved_entry=
  save_env prev_saved_entry
  set boot_once=true
fi

# Remember the entry that is being booted (${chosen}) as saved_entry and
# persist it, unless this is a one-shot boot (boot_once is non-empty).
function savedefault {
  if [ x"${boot_once}" = x ]; then
    saved_entry="${chosen}"
    save_env saved_entry
  fi
}
# Load the video drivers. When GRUB advertises feature_all_video_module, the
# single all_video module is loaded; otherwise each driver module is loaded
# individually.
function load_video {
  if [ x$feature_all_video_module != xy ]; then
    insmod efi_gop
    insmod efi_uga
    insmod ieee1275_fb
    insmod vbe
    insmod vga
    insmod video_bochs
    insmod video_cirrus
  else
    insmod all_video
  fi
}

# Menu on both the local console and serial unit 0 (115200 baud, 8 data bits,
# no parity, 1 stop bit).
serial --speed=115200 --unit=0 --word=8 --parity=no --stop=1
terminal_input console serial
terminal_output console serial

# Menu timeout: 30 seconds when recordfail is 1, otherwise 3 seconds. The menu
# timeout style is only set when this GRUB advertises the timeout_style
# feature; without it the plain timeout is all that is available.
if [ "${recordfail}" != 1 ] ; then
  if [ x$feature_timeout_style = xy ] ; then
    set timeout_style=menu
  fi
  set timeout=3
else
  set timeout=30
fi
### END /etc/grub.d/00_header ###

### BEGIN /etc/grub.d/05_debian_theme ###
# Menu colours, written as two colour names separated by "/": one pair for
# regular entries, one for the highlighted entry.
set menu_color_normal=cyan/blue
set menu_color_highlight=white/blue
### END /etc/grub.d/05_debian_theme ###

### BEGIN /etc/grub.d/07_mender_choose_partitions_grub ###
# Historical note: The "mender_boot_part" variable means "partition to use as
# root filesystem while booting", not "the boot partition". So it would be
# better if it was named "mender_rootfs_part", but we can't rename it for
# compatibility reasons. The rest of the variable names follow the latter
# logic.

# Re-detect the disk GRUB is currently working on instead of trusting the
# mender_grub_storage_device default (hd0). When another disk is enumerated
# first, the Mender image is e.g. hd1, and probing hd0 ends in a misleading
# "disk 'hd0,msdosN' not found" error. This is the same detection that
# 00_05_mender_setup_env_grub performs.
# NOTE(review): assumes regexp leaves the variable untouched when ${root}
# contains no comma, so the configured default still applies in that case.
regexp (.*),(.*) $root -s mender_grub_storage_device

# Pick the partition GRUB loads the kernel from. If a dedicated kernel
# partition is defined for the selected rootfs, use it with an empty path
# prefix; otherwise the kernel is read from /boot on the rootfs partition.
# NOTE(review): the literal word `test` after `-a` is kept as generated;
# presumably GRUB's `[` treats it as a non-empty string operand - confirm
# before simplifying.
if [ "${mender_boot_part}" = "${mender_rootfsa_part}" -a test -n "${mender_kernela_part}" ]; then
    mender_ptable_part=${mender_kernela_part}
    mender_kernel_path=""
elif [ "${mender_boot_part}" = "${mender_rootfsb_part}" -a test -n "${mender_kernelb_part}" ]; then
    mender_ptable_part=${mender_kernelb_part}
    mender_kernel_path=""
else
    mender_ptable_part=${mender_boot_part}
    mender_kernel_path="/boot"
fi

# Set ${root}, preferring the GPT name. The msdos name is now probed as well,
# so that an unreachable partition is reported here, naming the disk that was
# tried, rather than only as a later "msdosN not found" error.
if test -e (${mender_grub_storage_device},gpt${mender_ptable_part})/; then
    root="${mender_grub_storage_device},gpt${mender_ptable_part}"
elif test -e (${mender_grub_storage_device},msdos${mender_ptable_part})/; then
    root="${mender_grub_storage_device},msdos${mender_ptable_part}"
else
    echo "Neither (${mender_grub_storage_device},gpt${mender_ptable_part}) nor (${mender_grub_storage_device},msdos${mender_ptable_part}) is accessible. Check that ${mender_grub_storage_device} is the disk this system was booted from."
    # Keep the previous fallback value so that the existing failure handling
    # (reboot / rollback) behaves exactly as before.
    root="${mender_grub_storage_device},msdos${mender_ptable_part}"
fi

# Build the kernel's root= argument. With both partition UUIDs known, use
# PARTUUID=...; otherwise fall back to a device-name prefix plus the partition
# number. When the UUIDs are set but mender_boot_part matches neither rootfs,
# mender_kernel_root is left unchanged.
if test -n "${mender_rootfsa_uuid}" -a test -n  "${mender_rootfsb_uuid}"; then
    if [ "${mender_boot_part}" = "${mender_rootfsa_part}" ]; then
        mender_kernel_root="PARTUUID=${mender_rootfsa_uuid}"
    elif [ "${mender_boot_part}" = "${mender_rootfsb_part}" ]; then
        mender_kernel_root="PARTUUID=${mender_rootfsb_uuid}"
    fi
else
    mender_kernel_root="${mender_kernel_root_base}${mender_boot_part}"
fi

### END /etc/grub.d/07_mender_choose_partitions_grub ###

### BEGIN /etc/grub.d/08_mender_rollback ###
# While an upgrade is pending, add a "Roll back latest upgrade" menu entry
# whose only action is to reboot, and register it as GRUB's fallback entry.
# This must be the first menuentry in the file, because it is referenced by
# index 0 below.
if [ "${upgrade_available}" = "1" ]; then
    menuentry "Roll back latest upgrade" --id mender_rollback {
        reboot
    }

    # Ideally we would have liked to use a string in this "fallback" variable,
    # but because there is a bug in GRUB which accepts only indexes for
    # "fallback", we need to put the menuentry here in the first position, and
    # then use index 0 in "fallback". In a later script we will set "default",
    # which does accept strings, so that we don't boot the fallback by default.

    #fallback=mender_rollback
    fallback=0
fi

### END /etc/grub.d/08_mender_rollback ###

### BEGIN /etc/grub.d/10_linux ###
# Menu entries for the installed kernel (5.10.0-18-amd64).
#
# Kernel and initrd paths are relative to ${root}, which the Mender
# partition-selection section sets to the chosen rootfs (or kernel) partition.
# The kernel's root= argument comes from ${mender_kernel_root}.
#
# The default entry and the first "advanced" entry boot with console output on
# tty0 and ttyS0 (115200n8); the recovery entry boots with "single" and no
# console arguments.
#
# NOTE(review): the entry ids are single-quoted, so ${mender_kernel_root} is
# presumably not expanded inside them and the ids contain that text literally -
# confirm whether this is intended.

# Set the graphics payload mode to the first argument.
function gfxmode {
	set gfxpayload="${1}"
}
set linux_gfx_mode=
export linux_gfx_mode
menuentry 'Debian GNU/Linux' --class debian --class gnu-linux --class gnu --class os $menuentry_id_option 'gnulinux-simple-${mender_kernel_root}' {
	load_video
	insmod gzio
	if [ x$grub_platform = xxen ]; then insmod xzio; insmod lzopio; fi

	echo	'Loading Linux 5.10.0-18-amd64 ...'
	linux	/boot/vmlinuz-5.10.0-18-amd64 root=${mender_kernel_root} ro  console=tty0 console=ttyS0,115200n8
	echo	'Loading initial ramdisk ...'
	initrd	/boot/initrd.img-5.10.0-18-amd64
}
submenu 'Advanced options for Debian GNU/Linux' $menuentry_id_option 'gnulinux-advanced-${mender_kernel_root}' {
	menuentry 'Debian GNU/Linux, with Linux 5.10.0-18-amd64' --class debian --class gnu-linux --class gnu --class os $menuentry_id_option 'gnulinux-5.10.0-18-amd64-advanced-${mender_kernel_root}' {
		load_video
		insmod gzio
		if [ x$grub_platform = xxen ]; then insmod xzio; insmod lzopio; fi
	
		echo	'Loading Linux 5.10.0-18-amd64 ...'
		linux	/boot/vmlinuz-5.10.0-18-amd64 root=${mender_kernel_root} ro  console=tty0 console=ttyS0,115200n8
		echo	'Loading initial ramdisk ...'
		initrd	/boot/initrd.img-5.10.0-18-amd64
	}
	menuentry 'Debian GNU/Linux, with Linux 5.10.0-18-amd64 (recovery mode)' --class debian --class gnu-linux --class gnu --class os $menuentry_id_option 'gnulinux-5.10.0-18-amd64-recovery-${mender_kernel_root}' {
		load_video
		insmod gzio
		if [ x$grub_platform = xxen ]; then insmod xzio; insmod lzopio; fi
	
		echo	'Loading Linux 5.10.0-18-amd64 ...'
		linux	/boot/vmlinuz-5.10.0-18-amd64 root=${mender_kernel_root} ro single 
		echo	'Loading initial ramdisk ...'
		initrd	/boot/initrd.img-5.10.0-18-amd64
	}
}

### END /etc/grub.d/10_linux ###

### BEGIN /etc/grub.d/20_linux_xen ###

### END /etc/grub.d/20_linux_xen ###

### BEGIN /etc/grub.d/30_os-prober ###
### END /etc/grub.d/30_os-prober ###

### BEGIN /etc/grub.d/30_uefi-firmware ###
### END /etc/grub.d/30_uefi-firmware ###

### BEGIN /etc/grub.d/40_custom ###
# This file provides an easy way to add custom menu entries.  Simply type the
# menu entries you want to add after this comment.  Be careful not to change
# the 'exec tail' line above.
### END /etc/grub.d/40_custom ###

### BEGIN /etc/grub.d/41_custom ###
# Pull in an optional user-maintained custom.cfg. The copy in
# ${config_directory} takes precedence; $prefix/custom.cfg is only considered
# when config_directory is empty.
if [ -f  ${config_directory}/custom.cfg ]; then
  source ${config_directory}/custom.cfg
else
  if [ -z "${config_directory}" -a -f  $prefix/custom.cfg ]; then
    source $prefix/custom.cfg
  fi
fi
### END /etc/grub.d/41_custom ###

### BEGIN /etc/grub.d/50_mender_default ###
# While an upgrade is pending, menu index 0 is the "Roll back latest upgrade"
# entry (see mender_rollback). If the default is unset or still 0, move it to
# index 1 so that the first real boot entry is started instead.
if [ "${upgrade_available}" = "1" ]; then
    if [ -z "${default}" ]; then
        default=1
    elif [ "${default}" = "0" ]; then
        default=1
    fi
fi

### END /etc/grub.d/50_mender_default ###

### BEGIN /etc/grub.d/90_mender_generate_dual_rootfs_grub ###
### END /etc/grub.d/90_mender_generate_dual_rootfs_grub ###

I can now reliably reproduce the issue in QEMU by adding an empty hard disk before the hard disk that holds the mender-converted image. In this configuration the disk with the mender-converted image is not hd0 (the first disk in GRUB) but hd1, while hd0 is an empty disk. The GRUB configuration generated by mender-convert then fails to boot.

The culprit seems to be that hd0 is hard-coded in mender-grubenv as the boot device for the second stage. The first stage on the ESP tries to detect the boot device, but the second stage on the actual root partition uses just hd0 as the detection code is not included in the second stage.

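The relevant grub config parts are the hard-coded `mender_grub_storage_device=hd0` define and the `*_mender_choose_partitions_grub` sections in the grub.cfg quoted above.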

I would suggest always detecting the boot device from GRUB's prefix variable. The prefix variable should always point to the location of the currently loaded GRUB configuration, so using it would avoid cross-device confusion in GRUB and ensure that the boot device chosen by EFI is the one that is actually booted, regardless of its number in GRUB.