Skip to content

Latest commit

 

History

History
350 lines (271 loc) · 12.3 KB

内核映像组成.md

File metadata and controls

350 lines (271 loc) · 12.3 KB

内核映像组成

1539599177299.png

20190911_210701_54

20190911_210720_96

20190911_211133_78

一级推进系统

20190912_092533_19

boot_params结构

/* The so-called "zeropage" */
struct boot_params {
	struct screen_info screen_info;			/* 0x000 */
	struct apm_bios_info apm_bios_info;		/* 0x040 */
	__u8  _pad2[4];					/* 0x054 */
	__u64  tboot_addr;				/* 0x058 */
	struct ist_info ist_info;			/* 0x060 */
	__u64 acpi_rsdp_addr;				/* 0x070 */
	__u8  _pad3[8];					/* 0x078 */
	__u8  hd0_info[16];	/* obsolete! */		/* 0x080 */
	__u8  hd1_info[16];	/* obsolete! */		/* 0x090 */
	struct sys_desc_table sys_desc_table; /* obsolete! */	/* 0x0a0 */
	struct olpc_ofw_header olpc_ofw_header;		/* 0x0b0 */
	__u32 ext_ramdisk_image;			/* 0x0c0 */
	__u32 ext_ramdisk_size;				/* 0x0c4 */
	__u32 ext_cmd_line_ptr;				/* 0x0c8 */
	__u8  _pad4[116];				/* 0x0cc */
	struct edid_info edid_info;			/* 0x140 */
	struct efi_info efi_info;			/* 0x1c0 */
	__u32 alt_mem_k;				/* 0x1e0 */
	__u32 scratch;		/* Scratch field! */	/* 0x1e4 */
	__u8  e820_entries;				/* 0x1e8 */
	__u8  eddbuf_entries;				/* 0x1e9 */
	__u8  edd_mbr_sig_buf_entries;			/* 0x1ea */
	__u8  kbd_status;				/* 0x1eb */
	__u8  secure_boot;				/* 0x1ec */
	__u8  _pad5[2];					/* 0x1ed */
	/*
	 * The sentinel is set to a nonzero value (0xff) in header.S.
	 *
	 * A bootloader is supposed to only take setup_header and put
	 * it into a clean boot_params buffer. If it turns out that
	 * it is clumsy or too generous with the buffer, it most
	 * probably will pick up the sentinel variable too. The fact
	 * that this variable then is still 0xff will let kernel
	 * know that some variables in boot_params are invalid and
	 * kernel should zero out certain portions of boot_params.
	 */
	__u8  sentinel;					/* 0x1ef */
	__u8  _pad6[1];					/* 0x1f0 */
	struct setup_header hdr;    /* setup header */	/* 0x1f1 */
	__u8  _pad7[0x290-0x1f1-sizeof(struct setup_header)];
	__u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX];	/* 0x290 */
	struct boot_e820_entry e820_table[E820_MAX_ENTRIES_ZEROPAGE]; /* 0x2d0 */
	__u8  _pad8[48];				/* 0xcd0 */
	struct edd_info eddbuf[EDDMAXNR];		/* 0xd00 */
	__u8  _pad9[276];				/* 0xeec */
} __attribute__((packed));

代码链接:https://github.com/torvalds/linux/blob/master/arch/x86/include/uapi/asm/bootparam.h

Linuxn内核跟据不同的架构对bootloader有不同的要求。以x86架构来说,内核会需要bootloader将 struct boot_params 的资料准备好,然后在核心启动时传给核心。内核参数字符串就是包含在其中的 struct setup_header 结构内的 cmd_line_ptr

struct setup_header {
	__u8	setup_sects;
	__u16	root_flags;
  ...
	__u32	cmd_line_ptr;
	__u32	initrd_addr_max;
	__u32	kernel_alignment;
	__u8	relocatable_kernel;
	__u8	min_alignment;
	__u16	xloadflags;
	__u32	cmdline_size;
	__u32	hardware_subarch;
  ...
	__u32	handover_offset;
} __attribute__((packed));

上面是boot_params的结构和上级调用者,那么boot_params实例,如何获取数值呢?因为他需要搜集硬件信息,这个时候连驱动都不完善,怎么搞信息?

  • 两头约定,硬件BIOS系统提供硬件信息,至关重要的内存如何获取?e820中断,int 0x15
  • 显存信息怎么获取?BIOS 0x10中断
  • e820是和BIOS的一个中断相关的,具体说是int 0x15。之所以叫e820是因为在用这个中断时ax必须是0xe820。这个中断的作用是得到系统的内存布局。因为系统内存会有很多段,每段的类型属性也不一样,所以这个查询是“迭代式”的,每次求得一个段。

如下,就是获取显示相关的代码,用到BIOS 0x10中断

static void store_video_mode(void)
{
	struct biosregs ireg, oreg;

	/* N.B.: the saving of the video page here is a bit silly,
	   since we pretty much assume page 0 everywhere. */
	initregs(&ireg);
	ireg.ah = 0x0f;
	intcall(0x10, &ireg, &oreg);

	/* Not all BIOSes are clean with respect to the top bit */
	boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
	boot_params.screen_info.orig_video_page = oreg.bh;
}

代码链接:https://github.com/torvalds/linux/blob/master/arch/x86/boot/video.c

20190912_094337_91

20190912_094533_80

20190912_094643_51

  • BIOS中断,是硬件写死代码操作,其实刷BIOS就是改那段ROM,有些ROM也是可写,只是条件可写
  • BIOS中断向量表存放在物理内存首部0x0000开始
  • 中断向量表其实就是个地址数组,根据编号,跳转到对应的BIOS数据区,其实就是函数地址
  • 地址数组,实模式下地址多大?能多大嘛。。反正访问不超过1MB内存空间,所以干脆从0x00000开始

20190912_095039_34

  • int不是整型int,是汇编指令,可以转换成硬编码,CPU可以直接执行的,也就是说,执行int 0x10,CPU会自动跳转到中断向量表,取出地址作为下一条指令地址。。跟函数调用其实一样一样的,只是他这玩意儿int一下,很骚很方便而已
  • BIOS中断向量表就是个大主板产商统一规范过的,某个中断什么功能,怎么调用
  • 所以操作系统开发,你就需要了解BIOS提供的中断向量表信息,他会告诉你系统的各种信息,虽然很精简,但是足够你通过这些信息跑起操作系统

BIOS中向量表,调用一个BIOS中断,如果传参数,怎么搞?

  • BIOS参数通过硬件寄存器来,而不是内存
  • C语言默认参数都是通过内存堆栈

新问题来了,C语言如何,让函数通过寄存器传参数?书上后序就提到,通过编译器关键字修饰

20190912_100122_44

代码链接:https://github.com/torvalds/linux/blob/master/arch/x86/boot/Makefile

KBUILD_CFLAGS	:= $(REALMODE_CFLAGS) -D_SETUP
KBUILD_AFLAGS	:= $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
UBSAN_SANITIZE := n

代码链接:https://github.com/torvalds/linux/blob/master/arch/x86/Makefile

# How to compile the 16-bit code.  Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
#
# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
# older versions of GCC, include an *assembly* header to make sure that
# gcc doesn't play any games behind our back.
CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
M16_CFLAGS	 := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))

REALMODE_CFLAGS	:= $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
		   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
		   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
		   -mno-mmx -mno-sse

REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
export REALMODE_CFLAGS

追踪定义,还是可以看到 -mregparm=3

20200204_212818_20

  • 现在虽然有很多引导加载器,grub首当其冲,嵌入式下uboot,但是内核仍然需要在启动时候重新搜集一下硬件信息
  • 启动加载器可以将系统举起来,其实很重要一点就是可以引导多种OS,也很明显,各个OS都提供被引导的接口
  • 启动加载器和OS如何传递参数,其实就是写到内核第一个进程的PCB数据结构里面,然后内核构建伪文件系统后可以通过/proc/cmdline查看到

一级推进任务

  • 获取硬件信息(e820等BIOS中断,存放在boot_params)
  • 启动参数传递
  • 实模式转换保护模式

二级推进系统

20190912_103535_43

20190912_103436_57

二级推进任务(已经处于保护模式)

  • 重定位内核

有效载荷

20200204_213115_56

为什么要压缩内核?还不是老机器容量太小,能节俭就尽可能节俭

压缩重要意义,IO加载内核速度 远远小于 CPU执行速度。可以提升系统启动速度。

映像格式

20190912_104725_95

裸二进制格式,就是加载CPU,PC指向其头部就可以解析硬编码执行。。最原始,最奔放的执行方式

20190912_104848_54

裸二进制其实有很多局限性,尤其是兼容性不够,不利于推广

因此内核2.6.26新增特性,parse_elf解析ELF格式二进制内核。ELF格式也是个值得搞得内容,记得谢写写解析器

代码链接:https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/misc.c

/*
 * The compressed kernel image (ZO), has been moved so that its position
 * is against the end of the buffer used to hold the uncompressed kernel
 * image (VO) and the execution environment (.bss, .brk), which makes sure
 * there is room to do the in-place decompression. (See header.S for the
 * calculations.)
 *
 *                             |-----compressed kernel image------|
 *                             V                                  V
 * 0                       extract_offset                      +INIT_SIZE
 * |-----------|---------------|-------------------------|--------|
 *             |               |                         |        |
 *           VO__text      startup_32 of ZO          VO__end    ZO__end
 *             ^                                         ^
 *             |-------uncompressed kernel image---------|
 *
 */
asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
				  unsigned char *input_data,
				  unsigned long input_len,
				  unsigned char *output,
				  unsigned long output_len)
{
	const unsigned long kernel_total_size = VO__end - VO__text;
	unsigned long virt_addr = LOAD_PHYSICAL_ADDR;

	/* Retain x86 boot parameters pointer passed from startup_32/64. */
	boot_params = rmode;

	/* Clear flags intended for solely in-kernel use. */
	boot_params->hdr.loadflags &= ~KASLR_FLAG;

	sanitize_boot_params(boot_params);

	if (boot_params->screen_info.orig_video_mode == 7) {
		vidmem = (char *) 0xb0000;
		vidport = 0x3b4;
	} else {
		vidmem = (char *) 0xb8000;
		vidport = 0x3d4;
	}

...

	parse_elf(output);
	handle_relocations(output, output_len, virt_addr);
	debug_putstr("done.\nBooting the kernel.\n");
	return output;
}
static void parse_elf(void *output)
{
#ifdef CONFIG_X86_64
	Elf64_Ehdr ehdr;
	Elf64_Phdr *phdrs, *phdr;
#else
	Elf32_Ehdr ehdr;
	Elf32_Phdr *phdrs, *phdr;
#endif
	void *dest;
	int i;

	memcpy(&ehdr, output, sizeof(ehdr));
	if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
	   ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
	   ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
	   ehdr.e_ident[EI_MAG3] != ELFMAG3) {
		error("Kernel is not a valid ELF file");
		return;
	}

	debug_putstr("Parsing ELF... ");

	phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
	if (!phdrs)
		error("Failed to allocate space for phdrs");

	memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);

	for (i = 0; i < ehdr.e_phnum; i++) {
		phdr = &phdrs[i];

		switch (phdr->p_type) {
		case PT_LOAD:
#ifdef CONFIG_X86_64
			if ((phdr->p_align % 0x200000) != 0)
				error("Alignment of LOAD segment isn't multiple of 2MB");
#endif
#ifdef CONFIG_RELOCATABLE
			dest = output;
			dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR);
#else
			dest = (void *)(phdr->p_paddr);
#endif
			memmove(dest, output + phdr->p_offset, phdr->p_filesz);
			break;
		default: /* Ignore other PT_* */ break;
		}
	}

	free(phdrs);
}

书上还提到闹着玩儿的内容,去掉ELF解析

20190912_105325_20

20200204_213226_28


参考链接