From apw Mon Jan 16 03:19:50 2006 Date: Mon, 16 Jan 2006 03:19:50 -0500 To: Jimi Xenidis Subject: [patch] (take 4) asynchronous zeroing framework This patch has the SPE's DMA engine doing the zeroing. I have put it under heavy stress from userspace, verifying that each byte in the pages returned is indeed zero. Can you look it over though? I am a little uncertain about the cost of that tag status register read. It also adds two writable nodes in /proc/sys/vm/{limbo_lo,zeroed_lo} for tuning the lists from userspace. Signed-off-by: Amos Waterland --- diff --git a/Makefile b/Makefile index 34191fe..370c210 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 3 -EXTRAVERSION = -mem-1 +EXTRAVERSION = -mem15 NAME=Feisty Dunnart # *DOCUMENTATION* diff --git a/arch/ppc64/kernel/Makefile b/arch/ppc64/kernel/Makefile index a9b2532..0cc6b97 100644 --- a/arch/ppc64/kernel/Makefile +++ b/arch/ppc64/kernel/Makefile @@ -75,6 +75,7 @@ obj-$(CONFIG_BPA) += bpa_init.o spu_vmops.o spu_tprof.o \ bpa_lpar.o spu_softirq.o \ spu_traps.o +obj-$(CONFIG_BPA) += cell_prezero.o obj-$(CONFIG_PROC_FS) += bpa_proc.o diff --git a/arch/ppc64/kernel/bpa_init.c b/arch/ppc64/kernel/bpa_init.c index 41c590c..8111b2f 100644 --- a/arch/ppc64/kernel/bpa_init.c +++ b/arch/ppc64/kernel/bpa_init.c @@ -452,6 +452,8 @@ int __init bpa_init_phase_1(void) return (0); } +void cell_pgzero_init(int cpu); + int __init bpa_init_phase_2(void) { int i, cpu; @@ -484,6 +486,9 @@ int __init bpa_init_phase_2(void) spu_enable_ls_decode(cpu, i); } } + if (cpu == 0) { + cell_pgzero_init(cpu); + } } /* nr_spus_online must be initilized for both processors before diff --git a/arch/ppc64/kernel/cell_prezero.c b/arch/ppc64/kernel/cell_prezero.c new file mode 100644 index 0000000..62aede7 --- /dev/null +++ b/arch/ppc64/kernel/cell_prezero.c @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +static int cp_spe; 
+ +void cell_pgzero_init(int cpu) +{ + int spes = __bp_paca[cpu].spus_online; + int spe = fls(spes); + SPU_Priv1 volatile *priv1; + void *ls; + + /* spe 8 is bogus */ + spe = 0; + + printk("Removing SPE %d from general use\n", spe); + clear_bit(spe, &__bp_paca[cpu].spus_online); + --__bp_paca[cpu].nr_spus_online; + + priv1 = __get_spu_priv1(cpu, spe); + printk("Found CPU %d: SPE %d: priv1 area at: 0x%p\n", cpu, spe, priv1); + printk("Turning on MFC master run control of SPE %d\n", spe); + priv1->mfc_sr1_RW |= MFC_STATE1_MASTER_RUN_CONTROL_MASK; + eieio(); + + ls = (void *)__get_spu_ls(cpu, spe); + printk("Zeroing local store of SPE %d\n", spe); + memset(ls, 0, 1 << LOG2_LS_SIZE); + + cp_spe = spe; +} + +static int ppe_queue_prezero(unsigned pa) +{ + int cpu = 0; + unsigned ls = 0; + unsigned sz = PAGE_SIZE; + unsigned tag = 1; + unsigned rclass = 0; + unsigned op = MFC_PUT_CMD; + unsigned tag_mask = 0; /* cannot modify tagmask until completion */ + unsigned qtype = MFC_PU_DMA_QUERYTYPE_ANY; + unsigned flags = 1; /* block until this command hits the queue */ + int rc; + + rc = spu_mfc_dma(cpu, cp_spe, pa, ls, sz, tag, rclass, + op, tag_mask, qtype, flags); + return rc; + +} + +int cell_prezero(struct free_area *area) +{ + struct list_head *curr; + struct page *page; + void *addr; + struct SPU_Problem volatile *prob = __get_spu_prob(0, cp_spe); + + /* If work was outstanding, return 1 if done, 0 otherwise. */ + if (area->nr_queued) { + prob->dma_querytype_RW = MFC_PU_DMA_QUERYTYPE_ANY; + eieio(); + + if (prob->dma_tagstatus_R) { + area->nr_queued = 0; + return 1; + } + return 0; + } + + /* MFC is ready for more work, so queue it up baby. */ + BUG_ON(area->nr_queued); + list_for_each(curr, &area->limbo_list) { + page = list_entry(curr, struct page, list); + addr = page_address(page); + ppe_queue_prezero(__pa(addr)); + area->nr_queued++; + SetPageZeroed(page); + } + + /* We can set the tag mask only once for this batch. 
*/ + prob->dma_querymask_RW = 0x1 << 1; + eieio(); + + return 0; +} diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index dcb20c5..af2fd7d 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -99,6 +99,15 @@ static int loadavg_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int prezero_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf(page,"%ld %ld\n", prezero_hits, prezero_reqs); /* both counters are declared long, so %ld rather than %d */ + return proc_calc_metrics(page, start, off, count, eof, len); +} + struct vmalloc_info { unsigned long used; unsigned long largest_chunk; @@ -666,6 +675,7 @@ void __init proc_misc_init(void) #endif {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"prezero", prezero_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c969542..6960d69 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -32,6 +32,7 @@ #define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ #define __GFP_NORETRY 0x1000 /* Do not retry. 
Might fail */ #define __GFP_NO_GROW 0x2000 /* Slab internal usage */ +#define __GFP_ZERO 0x8000u /* Return zeroed page on success */ #define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 179f51f..b2dfa4f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -23,6 +23,11 @@ struct free_area { struct list_head free_list; unsigned long *map; + struct list_head limbo_list; + unsigned long nr_limbo; + struct list_head zeroed_list; + unsigned long nr_zeroed; + long nr_queued; }; struct pglist_data; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b35e71..b67abb8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -75,6 +75,7 @@ #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ #define PG_reclaim 18 /* To be reclaimed asap */ #define PG_compound 19 /* Part of a compound page */ +#define PG_zeroed 21 /* Page has been zeroed */ /* @@ -270,6 +271,11 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageZeroed(page) test_bit(PG_zeroed, &(page)->flags) +#define SetPageZeroed(page) set_bit(PG_zeroed, &(page)->flags) +#define ClearPageZeroed(page) clear_bit(PG_zeroed, &(page)->flags) +#define TestSetPageZeroed(page) test_and_set_bit(PG_zeroed, &(page)->flags) + /* * The PageSwapCache predicate doesn't use a PG_flag at this time, * but it may again do so one day. 
diff --git a/include/linux/slab.h b/include/linux/slab.h index 69be5b3..adfdeec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -119,6 +119,9 @@ void ptrinfo(unsigned long addr); extern atomic_t slab_reclaim_pages; +extern long prezero_hits; +extern long prezero_reqs; + #endif /* __KERNEL__ */ #endif /* _LINUX_SLAB_H */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index dc4167a..531ebfd 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -156,6 +156,8 @@ enum VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_LIMBO_LIST_DEPTH=22, + VM_ZEROED_LIST_DEPTH=23, }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1424811..f8c9d2c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -77,6 +77,8 @@ extern int dirty_background_ratio; extern int vm_dirty_ratio; extern int dirty_writeback_centisecs; extern int dirty_expire_centisecs; +extern int limbo_lo; +extern int zeroed_lo; struct ctl_table; struct file; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5bfd5cd..1142615 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -666,6 +666,22 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = VM_LIMBO_LIST_DEPTH, + .procname = "limbo_lo", + .data = &limbo_lo, + .maxlen = sizeof(limbo_lo), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_ZEROED_LIST_DEPTH, + .procname = "zeroed_lo", + .data = &zeroed_lo, + .maxlen = sizeof(zeroed_lo), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_NR_PDFLUSH_THREADS, .procname = "nr_pdflush_threads", .data = &nr_pdflush_threads, diff --git a/mm/memory.c b/mm/memory.c index da3728a..4b8ebbc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -94,10 +94,6 @@ EXPORT_SYMBOL(high_memory); */ static inline void copy_cow_page(struct 
page * from, struct page * to, unsigned long address) { - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); - return; - } copy_user_highpage(to, from, address); } @@ -1044,10 +1040,15 @@ static int do_wp_page(struct mm_struct * pte_chain = pte_chain_alloc(GFP_KERNEL); if (!pte_chain) goto no_pte_chain; - new_page = alloc_page(GFP_HIGHUSER); + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + } else { + new_page = alloc_page(GFP_HIGHUSER); + } if (!new_page) goto no_new_page; - copy_cow_page(old_page,new_page,address); + if (old_page != ZERO_PAGE(address)) + copy_cow_page(old_page,new_page,address); /* * Re-check the pte - we dropped the lock @@ -1344,10 +1345,9 @@ do_anonymous_page(struct mm_struct *mm, pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) goto no_mem; - clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c73033e..cdbf1c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,6 +55,12 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; int min_free_kbytes = 1024; +int limbo_lo = 4; +int zeroed_lo = 256; + +long prezero_hits = 0; +long prezero_reqs = 0; + /* * Temporary debugging check for pages not lying within a given zone. */ @@ -337,19 +343,49 @@ static void prep_new_page(struct page *p set_page_refs(page, order); } +/* This function always returns quickly, so you can call it with + * interrupts disabled. It will not return 1 until every page on the + * passed area's limbo list has been zeroed. This means that you can + * call it repeatedly while the asynchronous zeroing is going: it will + * just immediately return 0. You must not modify the limbo list + * until it returns 1. 
When you call it with a new limbo list, it + * does not return until all the list entries have been queued for + * zeroing, so do not make your list too long. + */ +static int do_prezero(struct free_area *area) +{ + extern int cell_prezero(struct free_area *area); + return cell_prezero(area); +} + /* * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *__rmqueue(struct zone *zone, unsigned int order, + unsigned int gfp_mask) { struct free_area * area; unsigned int current_order; struct page *page; unsigned int index; + if (gfp_mask & __GFP_ZERO) prezero_reqs++; + for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = zone->free_area + current_order; + + if (area->nr_zeroed && (gfp_mask & __GFP_ZERO)) { + prezero_hits++; + page = list_entry(area->zeroed_list.next, + struct page, list); + list_del(&page->list); + area->nr_zeroed--; + index = page - zone->zone_mem_map; + return expand(zone, page, index, order, + current_order, area); + } + if (list_empty(&area->free_list)) continue; @@ -359,6 +395,25 @@ static struct page *__rmqueue(struct zon if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); zone->free_pages -= 1UL << order; + + if (area->nr_limbo < limbo_lo) { + if ((gfp_mask & __GFP_ZERO) && (current_order == 0)) { + list_add(&page->list, &area->limbo_list); + area->nr_limbo++; + current_order--; + continue; + } + } + + if (area->nr_limbo >= limbo_lo && area->nr_zeroed < zeroed_lo) { + if (do_prezero(area)) { + list_splice_init(&area->limbo_list, + &area->zeroed_list); + area->nr_zeroed += area->nr_limbo; + area->nr_limbo = 0; + } + } + return expand(zone, page, index, order, current_order, area); } @@ -371,7 +426,8 @@ static struct page *__rmqueue(struct zon * Returns the number of new pages which were placed at *list. 
*/ static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) + unsigned long count, struct list_head *list, + unsigned int gfp_mask) { unsigned long flags; int i; @@ -380,7 +436,7 @@ static int rmqueue_bulk(struct zone *zon spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, gfp_mask); if (page == NULL) break; allocated++; @@ -472,13 +528,24 @@ void free_cold_page(struct page *page) free_hot_cold_page(page, 1); } +static inline void +prep_zero_page(struct page *page, int order, unsigned int gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + /* * Really, prep_compound_page() should be called from __rmqueue_bulk(). But * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) +static struct page *buffered_rmqueue(struct zone *zone, int order, int cold, + unsigned int gfp_mask) { unsigned long flags; struct page *page = NULL; @@ -490,7 +557,7 @@ static struct page *buffered_rmqueue(str local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + pcp->batch, &pcp->list, gfp_mask); if (pcp->count) { page = list_entry(pcp->list.next, struct page, list); list_del(&page->list); @@ -502,7 +569,7 @@ static struct page *buffered_rmqueue(str if (page == NULL) { spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); + page = __rmqueue(zone, order, gfp_mask); spin_unlock_irqrestore(&zone->lock, flags); if (order && page) prep_compound_page(page, order); @@ -512,7 +579,14 @@ static struct page *buffered_rmqueue(str BUG_ON(bad_range(zone, page)); mod_page_state(pgalloc, 1 << order); prep_new_page(page, order); + + if (gfp_mask & __GFP_ZERO) { + if 
(!PageZeroed(page)) + prep_zero_page(page, order, gfp_mask); + } + ClearPageZeroed(page); /* cleared here, where page is already dereferenced; past this block page may be NULL (failed allocation) */ + + } return page; } @@ -573,7 +647,7 @@ __alloc_pages(unsigned int gfp_mask, uns if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, cold); + page = buffered_rmqueue(z, order, cold, gfp_mask); if (page) goto got_pg; } @@ -598,7 +672,7 @@ __alloc_pages(unsigned int gfp_mask, uns min += local_min; if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, cold); + page = buffered_rmqueue(z, order, cold, gfp_mask); if (page) goto got_pg; } @@ -613,7 +687,7 @@ rebalance: for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; - page = buffered_rmqueue(z, order, cold); + page = buffered_rmqueue(z, order, cold, gfp_mask); if (page) goto got_pg; } @@ -641,7 +715,7 @@ rebalance: min += z->pages_min; if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { - page = buffered_rmqueue(z, order, cold); + page = buffered_rmqueue(z, order, cold, gfp_mask); if (page) goto got_pg; } @@ -1321,8 +1395,11 @@ static void __init free_area_init_core(s unsigned long bitmap_size; INIT_LIST_HEAD(&zone->free_area[i].free_list); + INIT_LIST_HEAD(&zone->free_area[i].limbo_list); + INIT_LIST_HEAD(&zone->free_area[i].zeroed_list); if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; + printk("Zone %li has %li limbo lists\n", j, i); break; }