This version contains the following patches with last-minute fixes for the -mm tree, plus Con Kolivas' 1 GB lowmem patch, up-to-date device mapper code compatible with EVMS, Nvidia compatibility, the emergency ICMP code and stability fixes from 2.6.11.4: 00011.patch 00012.patch 00013.patch 00014.patch 1g_lowmem1_i386.diff bd-claim.patch display_default_scheduler.patch dm-bbr.patch emergency_ping.patch gcc4_fbcon_compile_fix.patch isicom_section_fixes.patch jiffies.patch nfs_o_direct_fix.patch nfs_select_without_exportfs.patch ntfs-rw-compile-fix.patch nvidia_6111-6629_compat2.diff ppp_async_security_fix.patch slab-shrinkers-use-vfs_cache_pressure.patch sleep_on_fix.patch vmscan-notice-slab-shrinking.patch Device mapper updates come from 2.6.11-rc3-udm2. EVMS additions come from EVMS 2.5.2. The -jedi tree is available from ftp://ftp.c9x.org/linux-kernel/ The -mm tree (apply it before the -jedi patch) is available from ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/ The -jedi tree contains important last-minute fixes against the latest good-looking -mm tree, including the fixes from the -stable branch. This tree is designed to be stable for workstation or server usage. Every release is tested on loaded x86 and amd64 production servers. Please send an email to kernel@pureftpd.org in order to stay informed about new releases. diff -pruN linux-2.6.11-mm4/arch/i386/Kconfig linux-2.6.11-mm4-jedi1/arch/i386/Kconfig --- linux-2.6.11-mm4/arch/i386/Kconfig 2005-03-16 14:40:47.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/arch/i386/Kconfig 2005-03-16 15:18:00.000000000 +0100 @@ -757,6 +757,20 @@ config X86_PAE depends on HIGHMEM64G default y +config 1GLOWMEM + bool "1Gb Low Memory Support" + depends on NOHIGHMEM + default n + help + Linux on i386 architecture normally supports just 896Mb without + enabling HIGHMEM support. This option will enable you to support 1Gb + of ram without needing to enable HIGHMEM support. 
The advantage of + this is that you don't need the extra overhead of high memory support + to utilise the last 128Mb of ram. However this may break drivers such + as vmware. + + If unsure say "no" + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff -pruN linux-2.6.11-mm4/arch/i386/mm/init.c linux-2.6.11-mm4-jedi1/arch/i386/mm/init.c --- linux-2.6.11-mm4/arch/i386/mm/init.c 2005-03-16 14:40:47.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/arch/i386/mm/init.c 2005-03-16 15:22:18.000000000 +0100 @@ -41,6 +41,7 @@ #include unsigned int __VMALLOC_RESERVE = 128 << 20; +EXPORT_SYMBOL(__VMALLOC_RESERVE); DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; diff -pruN linux-2.6.11-mm4/drivers/block/elevator.c linux-2.6.11-mm4-jedi1/drivers/block/elevator.c --- linux-2.6.11-mm4/drivers/block/elevator.c 2005-03-16 14:40:55.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/block/elevator.c 2005-03-16 15:16:31.000000000 +0100 @@ -180,6 +180,8 @@ static int __init elevator_setup(char *s __setup("elevator=", elevator_setup); +static int default_msg = 0; + int elevator_init(request_queue_t *q, char *name) { struct elevator_type *e = NULL; @@ -195,6 +197,12 @@ int elevator_init(request_queue_t *q, ch if (!e) return -EINVAL; + if (!default_msg && !strcmp(e->elevator_name, chosen_elevator)) { + printk(KERN_INFO "using %s as default io scheduler\n", + chosen_elevator); + default_msg = 1; + } + eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL); if (!eq) { elevator_put(e->elevator_type); @@ -554,10 +562,7 @@ int elv_register(struct elevator_type *e list_add_tail(&e->list, &elv_list); spin_unlock_irq(&elv_list_lock); - printk(KERN_INFO "io scheduler %s registered", e->elevator_name); - if (!strcmp(e->elevator_name, chosen_elevator)) - printk(" (default)"); - printk("\n"); + printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name); return 0; } EXPORT_SYMBOL_GPL(elv_register); diff -pruN 
linux-2.6.11-mm4/drivers/char/isicom.c linux-2.6.11-mm4-jedi1/drivers/char/isicom.c --- linux-2.6.11-mm4/drivers/char/isicom.c 2005-03-16 14:40:56.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/char/isicom.c 2005-03-16 15:21:18.000000000 +0100 @@ -1756,7 +1756,7 @@ static void isicom_flush_buffer(struct t } -static int __init register_ioregion(void) +static int __devinit register_ioregion(void) { int count, done=0; for (count=0; count < BOARD_COUNT; count++ ) { @@ -1771,7 +1771,7 @@ static int __init register_ioregion(void return done; } -static void __exit unregister_ioregion(void) +static void unregister_ioregion(void) { int count; for (count=0; count < BOARD_COUNT; count++ ) @@ -1803,7 +1803,7 @@ static struct tty_operations isicom_ops .tiocmset = isicom_tiocmset, }; -static int __init register_drivers(void) +static int __devinit register_drivers(void) { int error; @@ -1834,7 +1834,7 @@ static int __init register_drivers(void) return 0; } -static void __exit unregister_drivers(void) +static void unregister_drivers(void) { int error = tty_unregister_driver(isicom_normal); if (error) @@ -1842,7 +1842,7 @@ static void __exit unregister_drivers(vo put_tty_driver(isicom_normal); } -static int __init register_isr(void) +static int __devinit register_isr(void) { int count, done=0; unsigned long irqflags; @@ -1883,7 +1883,7 @@ static void __exit unregister_isr(void) } } -static int __init isicom_init(void) +static int __devinit isicom_init(void) { int card, channel, base; struct isi_port * port; diff -pruN linux-2.6.11-mm4/drivers/md/dm-bbr.c linux-2.6.11-mm4-jedi1/drivers/md/dm-bbr.c --- linux-2.6.11-mm4/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/md/dm-bbr.c 2005-03-16 15:19:55.000000000 +0100 @@ -0,0 +1,1003 @@ +/* + * (C) Copyright IBM Corp. 
2002, 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * linux/drivers/md/dm-bbr.c + * + * Bad-block-relocation (BBR) target for device-mapper. + * + * The BBR target is designed to remap I/O write failures to another safe + * location on disk. Note that most disk drives have BBR built into them, + * this means that our software BBR will be only activated when all hardware + * BBR replacement sectors have been used. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm.h" +#include "dm-bio-list.h" +#include "dm-bio-record.h" +#include "dm-bbr.h" +#include "dm-io.h" + +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +static struct workqueue_struct *dm_bbr_wq = NULL; +static void bbr_remap_handler(void *data); +static kmem_cache_t *bbr_remap_cache; +static kmem_cache_t *bbr_io_cache; +static mempool_t *bbr_io_pool; + +/** + * bbr_binary_tree_destroy + * + * Destroy the binary tree. 
+ **/ +static void bbr_binary_tree_destroy(struct bbr_runtime_remap *root) +{ + struct bbr_runtime_remap **link = NULL; + struct bbr_runtime_remap *node = root; + + while (node) { + if (node->left) { + link = &(node->left); + node = node->left; + continue; + } + if (node->right) { + link = &(node->right); + node = node->right; + continue; + } + + kmem_cache_free(bbr_remap_cache, node); + if (node == root) { + /* If root is deleted, we're done. */ + break; + } + + /* Back to root, unhooking the leaf that was just freed from its parent. */ + node = root; + *link = NULL; + } +} +/* Free every node of the remap tree while holding remap_root_lock. */ +static void bbr_free_remap(struct bbr_private *bbr_id) +{ + spin_lock_irq(&bbr_id->remap_root_lock); + bbr_binary_tree_destroy(bbr_id->remap_root); + bbr_id->remap_root = NULL; + spin_unlock_irq(&bbr_id->remap_root_lock); +} +/* Allocate a zeroed bbr_private with its work item, locks and counter initialised; returns NULL if kmalloc() fails. */ +static struct bbr_private *bbr_alloc_private(void) +{ + struct bbr_private *bbr_id; + + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL); + if (bbr_id) { + memset(bbr_id, 0, sizeof(*bbr_id)); + INIT_WORK(&bbr_id->remap_work, bbr_remap_handler, bbr_id); + spin_lock_init(&bbr_id->remap_root_lock); + spin_lock_init(&bbr_id->remap_ios_lock); + atomic_set(&bbr_id->in_use_replacement_blks, 0); + } + + return bbr_id; +} +/* Release the table copy (if any), the remap tree and the private structure itself. */ +static void bbr_free_private(struct bbr_private *bbr_id) +{ + if (bbr_id->bbr_table) { + vfree(bbr_id->bbr_table); + } + bbr_free_remap(bbr_id); + kfree(bbr_id); +} + +static u32 crc_table[256]; +static u32 crc_table_built = 0; +/* Fill crc_table[] with the per-byte remainders for CRC_POLYNOMIAL and mark it built. */ +static void build_crc_table(void) +{ + u32 i, j, crc; + + for (i = 0; i <= 255; i++) { + crc = i; + for (j = 8; j > 0; j--) { + if (crc & 1) + crc = (crc >> 1) ^ CRC_POLYNOMIAL; + else + crc >>= 1; + } + crc_table[i] = crc; + } + crc_table_built = 1; +} +/* Fold buffersize bytes of buffer into crc using crc_table[] (built on first use) and return the new value. */ +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize) +{ + unsigned char *current_byte; + u32 temp1, temp2, i; + + current_byte = (unsigned char *) buffer; + /* Make sure the crc table is available */ + if (!crc_table_built) + build_crc_table(); + /* Process each byte in the buffer. 
*/ + for (i = 0; i < buffersize; i++) { + temp1 = (crc >> 8) & 0x00FFFFFF; + temp2 = crc_table[(crc ^ (u32) * current_byte) & + (u32) 0xff]; + current_byte++; + crc = temp1 ^ temp2; + } + return crc; +} + +/** + * le_bbr_table_sector_to_cpu + * + * Convert bbr meta data from on-disk (LE) format + * to the native cpu endian format. The conversion is done in place. + **/ +static void le_bbr_table_sector_to_cpu(struct bbr_table *p) +{ + int i; + p->signature = le32_to_cpup(&p->signature); + p->crc = le32_to_cpup(&p->crc); + p->sequence_number = le32_to_cpup(&p->sequence_number); + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt); + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { + p->entries[i].bad_sect = + le64_to_cpup(&p->entries[i].bad_sect); + p->entries[i].replacement_sect = + le64_to_cpup(&p->entries[i].replacement_sect); + } +} + +/** + * cpu_bbr_table_sector_to_le + * + * Convert bbr meta data from cpu endian format (@p) to on-disk (LE) format (@le); @p and @le may be the same sector. + **/ +static void cpu_bbr_table_sector_to_le(struct bbr_table *p, + struct bbr_table *le) +{ + int i; + le->signature = cpu_to_le32p(&p->signature); + le->crc = cpu_to_le32p(&p->crc); + le->sequence_number = cpu_to_le32p(&p->sequence_number); + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt); + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { + le->entries[i].bad_sect = + cpu_to_le64p(&p->entries[i].bad_sect); + le->entries[i].replacement_sect = + cpu_to_le64p(&p->entries[i].replacement_sect); + } +} + +/** + * validate_bbr_table_sector + * + * Check the specified BBR table sector for a valid signature and CRC. If it's + * valid, endian-convert the table sector. Returns 0 if valid, -EINVAL otherwise. + **/ +static int validate_bbr_table_sector(struct bbr_table *p) +{ + int rc = 0; + u32 org_crc, final_crc; /* u32 to match calculate_crc() and the le32 helpers */ + + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) { + DMERR("dm-bbr: BBR table signature doesn't match!"); + DMERR("dm-bbr: Found 0x%x. 
Expecting 0x%x", + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE); + rc = -EINVAL; + goto out; + } + + if (!p->crc) { + DMERR("dm-bbr: BBR table sector has no CRC!"); + rc = -EINVAL; + goto out; + } + /* The CRC is computed over the sector with its crc field zeroed. */ + org_crc = le32_to_cpup(&p->crc); + p->crc = 0; + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p)); + if (final_crc != org_crc) { + DMERR("dm-bbr: CRC failed!"); + DMERR("dm-bbr: Found 0x%x. Expecting 0x%x", + org_crc, final_crc); + rc = -EINVAL; + goto out; + } + /* Restore the on-disk CRC before converting the whole sector. */ + p->crc = cpu_to_le32p(&org_crc); + le_bbr_table_sector_to_cpu(p); + +out: + return rc; +} + +/** + * bbr_binary_tree_insert + * + * Insert a node into the binary tree, keyed by remap.bad_sect; keys not greater than a node's key go to its left subtree. + **/ +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root, + struct bbr_runtime_remap *newnode) +{ + struct bbr_runtime_remap **node = root; + while (node && *node) { + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) { + node = &((*node)->right); + } else { + node = &((*node)->left); + } + } + + newnode->left = newnode->right = NULL; + *node = newnode; +} + +/** + * bbr_binary_search + * + * Search for a node that contains bad_sect == lsn. Returns the node, or NULL if there is none. + **/ +static struct bbr_runtime_remap *bbr_binary_search( + struct bbr_runtime_remap *root, + u64 lsn) +{ + struct bbr_runtime_remap *node = root; + while (node) { + if (node->remap.bad_sect == lsn) { + break; + } + if (lsn > node->remap.bad_sect) { + node = node->right; + } else { + node = node->left; + } + } + return node; +} + +/** + * bbr_insert_remap_entry + * + * Create a new remap entry and add it to the binary tree for this node. 
+ **/ +static int bbr_insert_remap_entry(struct bbr_private *bbr_id, + struct bbr_table_entry *new_bbr_entry) +{ + struct bbr_runtime_remap *newnode; + /* The node is filled in before the tree lock is taken; returns 0, or -ENOMEM if the cache allocation fails. */ + newnode = kmem_cache_alloc(bbr_remap_cache, GFP_NOIO); + if (!newnode) { + DMERR("dm-bbr: Could not allocate from remap cache!"); + return -ENOMEM; + } + newnode->remap.bad_sect = new_bbr_entry->bad_sect; + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect; + spin_lock_irq(&bbr_id->remap_root_lock); + bbr_binary_tree_insert(&bbr_id->remap_root, newnode); + spin_unlock_irq(&bbr_id->remap_root_lock); + return 0; +} + +/** + * bbr_table_to_remap_list + * + * The on-disk bbr table is sorted by the replacement sector LBA. In order to + * improve run time performance, the in memory remap list must be sorted by + * the bad sector LBA. This function is called at discovery time to initialize + * the remap list. This function assumes that at least one copy of meta data + * is valid. Returns the sum of in_use_cnt over the table sectors walked. + **/ +static u32 bbr_table_to_remap_list(struct bbr_private *bbr_id) +{ + u32 in_use_blks = 0; + int i, j; + struct bbr_table *p; + /* Walk the table sectors in order and stop at the first one with no entries in use. */ + for (i = 0, p = bbr_id->bbr_table; + i < bbr_id->nr_sects_bbr_table; + i++, p++) { + if (!p->in_use_cnt) { + break; + } + in_use_blks += p->in_use_cnt; + for (j = 0; j < p->in_use_cnt; j++) { /* NOTE(review): in_use_cnt is not checked against BBR_ENTRIES_PER_SECT here -- confirm it is bounded elsewhere */ + bbr_insert_remap_entry(bbr_id, &p->entries[j]); /* NOTE(review): a -ENOMEM result is ignored, yet the entry is still counted */ + } + } + if (in_use_blks) { + char b[32]; + DMWARN("dm-bbr: There are %u BBR entries for device %s", + in_use_blks, format_dev_t(b, bbr_id->dev->bdev->bd_dev)); + } + + return in_use_blks; +} + +/** + * bbr_search_remap_entry + * + * Search remap entry for the specified sector. If found, return a pointer to + * the table entry. Otherwise, return NULL. 
+ **/ +static struct bbr_table_entry *bbr_search_remap_entry( + struct bbr_private *bbr_id, + u64 lsn) +{ + struct bbr_runtime_remap *p; + /* The lookup runs under remap_root_lock; the pointer is returned after the lock is dropped. */ + spin_lock_irq(&bbr_id->remap_root_lock); + p = bbr_binary_search(bbr_id->remap_root, lsn); + spin_unlock_irq(&bbr_id->remap_root_lock); + if (p) { + return (&p->remap); + } else { + return NULL; + } +} + +/** + * bbr_remap + * + * If *lsn is in the remap table, return TRUE and modify *lsn, + * else, return FALSE. The tree is only searched when in_use_replacement_blks is non-zero. + **/ +static inline int bbr_remap(struct bbr_private *bbr_id, + u64 *lsn) +{ + struct bbr_table_entry *e; + + if (atomic_read(&bbr_id->in_use_replacement_blks)) { + e = bbr_search_remap_entry(bbr_id, *lsn); + if (e) { + *lsn = e->replacement_sect; + return 1; + } + } + return 0; +} + +/** + * bbr_remap_probe + * + * If any sector probed in [lsn, lsn+nr_sects), stepping by blksize_in_sects, + * is in the remap table return TRUE. Else, return FALSE. + **/ +static inline int bbr_remap_probe(struct bbr_private *bbr_id, + u64 lsn, u64 nr_sects) +{ + u64 tmp, cnt; + /* NOTE(review): the loop step is blksize_in_sects; a value of 0 would never advance cnt -- confirm the constructor cannot produce it. */ + if (atomic_read(&bbr_id->in_use_replacement_blks)) { + for (cnt = 0, tmp = lsn; + cnt < nr_sects; + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) { + if (bbr_remap(bbr_id,&tmp)) { + return 1; + } + } + } + return 0; +} + +/** + * bbr_setup + * + * Read the remap tables from disk and set up the initial remap tree. Table 2 is consulted only when the read of a table 1 sector fails. + **/ +static int bbr_setup(struct bbr_private *bbr_id) +{ + struct bbr_table *table = bbr_id->bbr_table; + struct io_region job; + unsigned long error; + int i, rc = 0; + + job.bdev = bbr_id->dev->bdev; + job.count = 1; + + /* Read and verify each BBR table sector individually. 
*/ + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) { + job.sector = bbr_id->lba_table1 + i; + rc = dm_io_sync_vm(1, &job, READ, table, &error); + if (rc && bbr_id->lba_table2) { + job.sector = bbr_id->lba_table2 + i; + rc = dm_io_sync_vm(1, &job, READ, table, &error); + } + if (rc) { + goto out; + } + /* A sector that reads fine but fails validation aborts setup; the other copy is not tried. */ + rc = validate_bbr_table_sector(table); + if (rc) { + goto out; + } + } + atomic_set(&bbr_id->in_use_replacement_blks, + bbr_table_to_remap_list(bbr_id)); + +out: + if (rc) { + DMERR("dm-bbr: error during device setup: %d", rc); + } + return rc; +} + +/** + * bbr_io_remap_error + * @bbr_id: Private data for the BBR node. + * @rw: READ or WRITE. + * @starting_lsn: Starting sector of request to remap. + * @count: Number of sectors in the request. + * @page: Page containing the data for the request. + * @offset: Byte-offset of the data within the page. + * + * For the requested range, try to write each sector individually. For each + * sector that fails, find the next available remap location and write the + * data to that new location. Then update the table and write both copies + * of the table to disk. Finally, update the in-memory mapping and do any + * other necessary bookkeeping. Returns 0, -EIO when no replacement sector is left, or the error from the table write or tree insert. + **/ +static int bbr_io_remap_error(struct bbr_private *bbr_id, + int rw, + u64 starting_lsn, + u64 count, + struct page *page, + unsigned int offset) +{ + struct bbr_table *bbr_table; + struct io_region job; + struct page_list pl; + unsigned long table_sector_index; + unsigned long table_sector_offset; + unsigned long index; + unsigned long error; + u64 lsn, new_lsn; + char b[32]; + int rc; + + job.bdev = bbr_id->dev->bdev; + job.count = 1; + pl.page = page; + pl.next = NULL; + + /* For each sector in the request. */ + for (lsn = 0; lsn < count; lsn++, offset += SECTOR_SIZE) { + job.sector = starting_lsn + lsn; + rc = dm_io_sync(1, &job, rw, &pl, offset, &error); + while (rc) { + /* Find the next available relocation sector. 
*/ + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks); + if (new_lsn >= bbr_id->nr_replacement_blks) { + /* No more replacement sectors available. */ + return -EIO; + } + new_lsn += bbr_id->start_replacement_sect; + + /* Write the data to its new location. */ + DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64, + format_dev_t(b, bbr_id->dev->bdev->bd_dev), + starting_lsn + lsn, new_lsn); + job.sector = new_lsn; + rc = dm_io_sync(1, &job, rw, &pl, offset, &error); + if (rc) { + /* This replacement sector is bad. + * Try the next one. + */ + DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.", + format_dev_t(b, bbr_id->dev->bdev->bd_dev), new_lsn); + atomic_inc(&bbr_id->in_use_replacement_blks); + continue; + } + + /* Add this new entry to the on-disk table. The slot is the replacement sector's index: sector = index / BBR_ENTRIES_PER_SECT, entry = index % BBR_ENTRIES_PER_SECT. */ + table_sector_index = new_lsn - + bbr_id->start_replacement_sect; + table_sector_offset = table_sector_index / + BBR_ENTRIES_PER_SECT; + index = table_sector_index % BBR_ENTRIES_PER_SECT; + + bbr_table = &bbr_id->bbr_table[table_sector_offset]; + bbr_table->entries[index].bad_sect = starting_lsn + lsn; + bbr_table->entries[index].replacement_sect = new_lsn; + bbr_table->in_use_cnt++; + bbr_table->sequence_number++; + bbr_table->crc = 0; + bbr_table->crc = calculate_crc(INITIAL_CRC, + bbr_table, + sizeof(struct bbr_table)); + + /* Write the table to disk. NOTE(review): the CRC above is taken over the CPU-endian image, while validate_bbr_table_sector() checks it over the on-disk LE image; these presumably agree only on little-endian hosts -- confirm. */ + cpu_bbr_table_sector_to_le(bbr_table, bbr_table); + if (bbr_id->lba_table1) { + job.sector = bbr_id->lba_table1 + table_sector_offset; + rc = dm_io_sync_vm(1, &job, WRITE, bbr_table, &error); + } + if (bbr_id->lba_table2) { + job.sector = bbr_id->lba_table2 + table_sector_offset; + rc |= dm_io_sync_vm(1, &job, WRITE, bbr_table, &error); + } + le_bbr_table_sector_to_cpu(bbr_table); + + if (rc) { + /* Error writing one of the tables to disk. 
*/ + DMERR("dm-bbr: device %s: error updating BBR tables on disk.", + format_dev_t(b, bbr_id->dev->bdev->bd_dev)); + return rc; + } + + /* Insert a new entry in the remapping binary-tree. */ + rc = bbr_insert_remap_entry(bbr_id, + &bbr_table->entries[index]); + if (rc) { + DMERR("dm-bbr: device %s: error adding new entry to remap tree.", + format_dev_t(b, bbr_id->dev->bdev->bd_dev)); + return rc; + } + + atomic_inc(&bbr_id->in_use_replacement_blks); + } + } + + return 0; +} + +/** + * bbr_io_process_request + * + * For each sector in this request, check if the sector has already + * been remapped. If so, process all previous sectors in the request, + * followed by the remapped sector. Then reset the starting lsn and + * count, and keep going with the rest of the request as if it were + * a whole new request. If any of the sync_io's return an error, + * call the remapper to relocate the bad sector(s). + * + * 2.5 Note: When switching over to bio's for the I/O path, we have made + * the assumption that the I/O request described by the bio is one + * virtually contiguous piece of memory (even though the bio vector + * describes it using a series of physical page addresses). + **/ +static int bbr_io_process_request(struct bbr_private *bbr_id, + struct bio *bio) +{ + struct io_region job; + u64 starting_lsn = bio->bi_sector; + u64 count, lsn, remapped_lsn; + struct page_list pl; + unsigned int offset; + unsigned long error; + int i, rw = bio_data_dir(bio); + int rc = 0; + + job.bdev = bbr_id->dev->bdev; + pl.next = NULL; + + /* Each bio can contain multiple vectors, each with a different page. + * Treat each vector as a separate request. + */ + /* KMC: Is this the right way to walk the bvec list? NOTE(review): i counts from 0 to bi_vcnt while bi_idx is advanced and never restored, so this presumably assumes bi_idx == 0 on entry -- confirm. */ + for (i = 0; + i < bio->bi_vcnt; + i++, bio->bi_idx++, starting_lsn += count) { + + /* Bvec info: number of sectors, page, + * and byte-offset within page. 
+ */ + count = bio_iovec(bio)->bv_len >> SECTOR_SHIFT; + pl.page = bio_iovec(bio)->bv_page; + offset = bio_iovec(bio)->bv_offset; + + /* For each sector in this bvec, check if the sector has + * already been remapped. If so, process all previous sectors + * in this request, followed by the remapped sector. Then reset + * the starting lsn and count and keep going with the rest of + * the request as if it were a whole new request. + */ + for (lsn = 0; lsn < count; lsn++) { + remapped_lsn = starting_lsn + lsn; + rc = bbr_remap(bbr_id, &remapped_lsn); + if (!rc) { + /* This sector is fine. */ + continue; + } + + /* Process all sectors in the request up to this one. */ + if (lsn > 0) { + job.sector = starting_lsn; + job.count = lsn; + rc = dm_io_sync(1, &job, rw, &pl, + offset, &error); + if (rc) { + /* If this I/O failed, then one of the + * sectors in this request needs to be + * relocated. + */ + rc = bbr_io_remap_error(bbr_id, rw, + starting_lsn, + lsn, pl.page, + offset); + if (rc) { + /* KMC: Return? Or continue to next bvec? */ + return rc; + } + } + offset += (lsn << SECTOR_SHIFT); + } + + /* Process the remapped sector. */ + job.sector = remapped_lsn; + job.count = 1; + rc = dm_io_sync(1, &job, rw, &pl, offset, &error); + if (rc) { + /* BUGBUG - Need more processing if this caused + * an error. If this I/O failed, then the + * existing remap is now bad, and we need to + * find a new remap. Can't use + * bbr_io_remap_error(), because the existing + * map entry needs to be changed, not added + * again, and the original table entry also + * needs to be changed. + */ + return rc; + } + /* Restart the scan just past the remapped sector. */ + starting_lsn += (lsn + 1); + count -= (lsn + 1); + lsn = -1; /* u64 wrap-around; the loop's lsn++ brings it back to 0 */ + offset += SECTOR_SIZE; + } + + /* Check for any remaining sectors after the last split. This + * could potentially be the whole request, but that should be a + * rare case because requests should only be processed by the + * thread if we know an error occurred or they contained one or + * more remapped sectors. 
+ */ + if (count) { + job.sector = starting_lsn; + job.count = count; + rc = dm_io_sync(1, &job, rw, &pl, offset, &error); + if (rc) { + /* If this I/O failed, then one of the sectors + * in this request needs to be relocated. + */ + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn, + count, pl.page, offset); + if (rc) { + /* KMC: Return? Or continue to next bvec? */ + return rc; + } + } + } + } + + return 0; +} +/* Process each bio of a bi_next-linked list in turn and complete it with the status from bbr_io_process_request(). */ +static void bbr_io_process_requests(struct bbr_private *bbr_id, + struct bio *bio) +{ + struct bio *next; + int rc; + + while (bio) { + next = bio->bi_next; + bio->bi_next = NULL; + + rc = bbr_io_process_request(bbr_id, bio); + + bio_endio(bio, bio->bi_size, rc); + + bio = next; + } +} + +/** + * bbr_remap_handler + * + * This is the handler for the bbr work-queue. + * + * I/O requests should only be sent to this handler if we know that: + * a) the request contains at least one remapped sector. + * or + * b) the request caused an error on the normal I/O path. + * + * This function uses synchronous I/O, so sending a request to this + * thread that doesn't need special processing will cause severe + * performance degradation. + **/ +static void bbr_remap_handler(void *data) +{ + struct bbr_private *bbr_id = data; + struct bio *bio; + unsigned long flags; + /* Take the whole pending list under the lock, then process it with the lock released. */ + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags); + bio = bio_list_get(&bbr_id->remap_ios); + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags); + + bbr_io_process_requests(bbr_id, bio); +} + +/** + * bbr_endio + * + * This is the callback for normal write requests. Check for an error + * during the I/O, and send to the thread for processing if necessary. 
+ **/ +static int bbr_endio(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + struct bbr_private *bbr_id = ti->private; + struct dm_bio_details *bbr_io = map_context->ptr; + + if (error && bbr_io) { + unsigned long flags; + char b[32]; + + dm_bio_restore(bbr_io, bio); + map_context->ptr = NULL; + + DMERR("dm-bbr: device %s: I/O failure on sector %lu. " + "Scheduling for retry.", + format_dev_t(b, bbr_id->dev->bdev->bd_dev), + (unsigned long)bio->bi_sector); + + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags); + bio_list_add(&bbr_id->remap_ios, bio); + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags); + + queue_work(dm_bbr_wq, &bbr_id->remap_work); + + error = 1; + } + + if (bbr_io) + mempool_free(bbr_io, bbr_io_pool); + + return error; +} + +/** + * Construct a bbr mapping + **/ +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct bbr_private *bbr_id; + unsigned long block_size; + char *end; + int rc = -EINVAL; + + if (argc != 8) { + ti->error = "dm-bbr requires exactly 8 arguments: " + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size"; + goto out1; + } + + bbr_id = bbr_alloc_private(); + if (!bbr_id) { + ti->error = "dm-bbr: Error allocating bbr private data."; + goto out1; + } + + bbr_id->offset = simple_strtoull(argv[1], &end, 10); + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10); + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10); + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10); + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10); + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10); + block_size = simple_strtoul(argv[7], &end, 10); + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT); + + bbr_id->bbr_table = vmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT); + if (!bbr_id->bbr_table) { + ti->error = "dm-bbr: Error allocating bbr table."; + goto out2; + } + + if 
(dm_get_device(ti, argv[0], 0, ti->len, + dm_table_get_mode(ti->table), &bbr_id->dev)) { + ti->error = "dm-bbr: Device lookup failed"; + goto out2; + } + + rc = bbr_setup(bbr_id); + if (rc) { + ti->error = "dm-bbr: Device setup failed"; + goto out3; + } + + ti->private = bbr_id; + return 0; + +out3: + dm_put_device(ti, bbr_id->dev); +out2: + bbr_free_private(bbr_id); +out1: + return rc; +} + +static void bbr_dtr(struct dm_target *ti) +{ + struct bbr_private *bbr_id = ti->private; + + dm_put_device(ti, bbr_id->dev); + bbr_free_private(bbr_id); +} + +static int bbr_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct bbr_private *bbr_id = ti->private; + struct dm_bio_details *bbr_io; + unsigned long flags; + int rc = 1; + + bio->bi_sector += bbr_id->offset; + + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || + !bbr_remap_probe(bbr_id, bio->bi_sector, bio_sectors(bio))) { + /* No existing remaps or this request doesn't + * contain any remapped sectors. + */ + bio->bi_bdev = bbr_id->dev->bdev; + + bbr_io = mempool_alloc(bbr_io_pool, GFP_NOIO); + dm_bio_record(bbr_io, bio); + map_context->ptr = bbr_io; + } else { + /* This request has at least one remapped sector. + * Give it to the work-queue for processing. 
+ */ + map_context->ptr = NULL; + spin_lock_irqsave(&bbr_id->remap_ios_lock, flags); + bio_list_add(&bbr_id->remap_ios, bio); + spin_unlock_irqrestore(&bbr_id->remap_ios_lock, flags); + + queue_work(dm_bbr_wq, &bbr_id->remap_work); + rc = 0; + } + + return rc; +} + +static int bbr_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct bbr_private *bbr_id = ti->private; + char b[BDEVNAME_SIZE]; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u", + format_dev_t(b, bbr_id->dev->bdev->bd_dev), + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2, + bbr_id->nr_sects_bbr_table, + bbr_id->start_replacement_sect, + bbr_id->nr_replacement_blks, + bbr_id->blksize_in_sects << SECTOR_SHIFT); + break; + } + return 0; +} + +static struct target_type bbr_target = { + .name = "bbr", + .version= {1, 0, 1}, + .module = THIS_MODULE, + .ctr = bbr_ctr, + .dtr = bbr_dtr, + .map = bbr_map, + .end_io = bbr_endio, + .status = bbr_status, +}; + +int __init dm_bbr_init(void) +{ + int rc; + + rc = dm_register_target(&bbr_target); + if (rc) { + DMERR("dm-bbr: error registering target."); + goto err1; + } + + bbr_remap_cache = kmem_cache_create("bbr-remap", + sizeof(struct bbr_runtime_remap), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!bbr_remap_cache) { + DMERR("dm-bbr: error creating remap cache."); + rc = ENOMEM; + goto err2; + } + + bbr_io_cache = kmem_cache_create("bbr-io", sizeof(struct dm_bio_details), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!bbr_io_cache) { + DMERR("dm-bbr: error creating io cache."); + rc = ENOMEM; + goto err3; + } + + bbr_io_pool = mempool_create(256, mempool_alloc_slab, + mempool_free_slab, bbr_io_cache); + if (!bbr_io_pool) { + DMERR("dm-bbr: error creating io mempool."); + rc = ENOMEM; + goto err4; + } + + dm_bbr_wq = create_workqueue("dm-bbr"); + if (!dm_bbr_wq) { + DMERR("dm-bbr: 
error creating work-queue."); + rc = ENOMEM; + goto err5; + } + + rc = dm_io_get(1); + if (rc) { + DMERR("dm-bbr: error initializing I/O service."); + goto err6; + } + + return 0; + +err6: + destroy_workqueue(dm_bbr_wq); +err5: + mempool_destroy(bbr_io_pool); +err4: + kmem_cache_destroy(bbr_io_cache); +err3: + kmem_cache_destroy(bbr_remap_cache); +err2: + dm_unregister_target(&bbr_target); +err1: + return rc; +} + +void __exit dm_bbr_exit(void) +{ + dm_io_put(1); + destroy_workqueue(dm_bbr_wq); + mempool_destroy(bbr_io_pool); + kmem_cache_destroy(bbr_io_cache); + kmem_cache_destroy(bbr_remap_cache); + dm_unregister_target(&bbr_target); +} + +module_init(dm_bbr_init); +module_exit(dm_bbr_exit); +MODULE_LICENSE("GPL"); diff -pruN linux-2.6.11-mm4/drivers/md/dm-bbr.h linux-2.6.11-mm4-jedi1/drivers/md/dm-bbr.h --- linux-2.6.11-mm4/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/md/dm-bbr.h 2005-03-16 15:19:55.000000000 +0100 @@ -0,0 +1,125 @@ +/* + * (C) Copyright IBM Corp. 2002, 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * linux/drivers/md/dm-bbr.h + * + * Bad-block-relocation (BBR) target for device-mapper. + * + * The BBR target is designed to remap I/O write failures to another safe + * location on disk. 
Note that most disk drives have BBR built into them, + * this means that our software BBR will be only activated when all hardware + * BBR replacement sectors have been used. + */ + +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */ +#define BBR_ENTRIES_PER_SECT 31 +#define INITIAL_CRC 0xFFFFFFFF +#define CRC_POLYNOMIAL 0xEDB88320L + +/** + * PFU64: printf-style conversion for an unsigned 64-bit value. It expands to + * "%lu" when BITS_PER_LONG > 32 and to "%Lu" otherwise; use it in place of %Lu. + **/ +#if BITS_PER_LONG > 32 +#define PFU64 "%lu" +#else +#define PFU64 "%Lu" +#endif + +/** + * struct bbr_table_entry + * @bad_sect: LBA of bad location. + * @replacement_sect: LBA of new location. + * + * Structure to describe one BBR remap. + **/ +struct bbr_table_entry { + u64 bad_sect; + u64 replacement_sect; +}; + +/** + * struct bbr_table + * @signature: Signature on each BBR table sector. + * @crc: CRC for this table sector. + * @sequence_number: Used to resolve conflicts when primary and secondary + * tables do not match. + * @in_use_cnt: Number of in-use table entries. + * @entries: Actual table of remaps. + * + * Structure to describe each sector of the metadata table. Each sector holds + * up to BBR_ENTRIES_PER_SECT (31) remaps: 16 header bytes + 31 * 16 = 512. + **/ +struct bbr_table { + u32 signature; + u32 crc; + u32 sequence_number; + u32 in_use_cnt; + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT]; +}; + +/** + * struct bbr_runtime_remap + * + * Node in the binary tree used to keep track of remaps. + **/ +struct bbr_runtime_remap { + struct bbr_table_entry remap; + struct bbr_runtime_remap *left; + struct bbr_runtime_remap *right; +}; + +/** + * struct bbr_private + * @dev: Info about underlying device. + * @bbr_table: Copy of metadata table. + * @remap_root: Binary tree containing all remaps. + * @remap_root_lock: Lock for the binary tree. + * @remap_work: For adding work items to the work-queue. + * @remap_ios: List of I/Os for the work-queue to handle. + * @remap_ios_lock: Lock for the remap_ios list.
+ * @offset: LBA of data area.
+ * @lba_table1: LBA of primary BBR table.
+ * @lba_table2: LBA of secondary BBR table.
+ * @nr_sects_bbr_table: Size of each BBR table.
+ * @nr_replacement_blks: Number of replacement blocks.
+ * @start_replacement_sect: LBA of start of replacement blocks.
+ * @blksize_in_sects: Size of each block.
+ * @in_use_replacement_blks: Current number of remapped blocks.
+ *
+ * Private data for each BBR target.
+ **/
+struct bbr_private {
+	struct dm_dev *dev;
+	struct bbr_table *bbr_table;
+	struct bbr_runtime_remap *remap_root;
+	spinlock_t remap_root_lock;
+
+	struct work_struct remap_work;
+	struct bio_list remap_ios;
+	spinlock_t remap_ios_lock;
+
+	u64 offset;
+	u64 lba_table1;
+	u64 lba_table2;
+	u64 nr_sects_bbr_table;
+	u64 start_replacement_sect;
+	u64 nr_replacement_blks;
+	u32 blksize_in_sects;
+	atomic_t in_use_replacement_blks;
+};
+
diff -pruN linux-2.6.11-mm4/drivers/md/dm-flakey.c linux-2.6.11-mm4-jedi1/drivers/md/dm-flakey.c
--- linux-2.6.11-mm4/drivers/md/dm-flakey.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.11-mm4-jedi1/drivers/md/dm-flakey.c	2005-03-16 15:18:13.000000000 +0100
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2003 Sistina Software (UK) Limited.
+ * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include
+#include
+#include
+#include
+#include
+
+typedef typeof(jiffies) jiffy_t;
+
+/*
+ * Flakey: Used for testing only, simulates intermittent,
+ * catastrophic device failure.
+ */
+struct flakey {
+	struct dm_dev *dev;
+	jiffy_t start_time;
+	sector_t start;
+	unsigned up_interval;
+	unsigned down_interval;
+};
+
+/*
+ * Construct a flakey mapping: dev_path offset up_interval down_interval
+ * (intervals are in seconds; up_interval + down_interval must be non-zero)
+ */
+static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct flakey *f;
+
+	if (argc != 4) {
+		ti->error = "dm-flakey: Invalid argument count";
+		return -EINVAL;
+	}
+
+	f = kmalloc(sizeof(*f), GFP_KERNEL);
+	if (!f) {
+		ti->error = "dm-flakey: Cannot allocate linear context";
+		return -ENOMEM;
+	}
+	f->start_time = jiffies;
+
+	if (sscanf(argv[1], SECTOR_FORMAT, &f->start) != 1) {
+		ti->error = "dm-flakey: Invalid device sector";
+		goto bad;
+	}
+
+	if (sscanf(argv[2], "%u", &f->up_interval) != 1) {
+		ti->error = "dm-flakey: Invalid up interval";
+		goto bad;
+	}
+
+	if (sscanf(argv[3], "%u", &f->down_interval) != 1) {
+		ti->error = "dm-flakey: Invalid down interval";
+		goto bad;
+	}
+
+	/* flakey_map() takes elapsed time modulo this sum: it must not be 0 */
+	if (!(f->up_interval + f->down_interval)) {
+		ti->error = "dm-flakey: Total (up + down) interval is zero";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[0], f->start, ti->len,
+			  dm_table_get_mode(ti->table), &f->dev)) {
+		ti->error = "dm-flakey: Device lookup failed";
+		goto bad;
+	}
+
+	ti->private = f;
+	return 0;
+
+ bad:
+	kfree(f);
+	return -EINVAL;
+}
+
+static void flakey_dtr(struct dm_target *ti)
+{
+	struct flakey *f = (struct flakey *) ti->private;
+
+	dm_put_device(ti, f->dev);
+	kfree(f);
+}
+
+/*
+ * Map a bio: fail it with -EIO during the "down" part of each
+ * up_interval + down_interval cycle, otherwise remap it to the device.
+ */
+static int flakey_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct flakey *f = (struct flakey *) ti->private;
+	unsigned elapsed;
+
+	/* are we alive ? */
+	elapsed = (jiffies - f->start_time) / HZ;
+	elapsed %= (f->up_interval + f->down_interval);
+	if (elapsed >= f->up_interval)
+		return -EIO;
+
+	else {
+		bio->bi_bdev = f->dev->bdev;
+		bio->bi_sector = f->start + (bio->bi_sector - ti->begin);
+	}
+
+	return 1;
+}
+
+static int flakey_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct flakey *f = (struct flakey *) ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s " SECTOR_FORMAT, f->dev->name,
+			 f->start);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type flakey_target = {
+	.name   = "flakey",
+	.version= {1, 0, 1},
+	.module = THIS_MODULE,
+	.ctr    = flakey_ctr,
+	.dtr    = flakey_dtr,
+	.map    = flakey_map,
+	.status = flakey_status,
+};
+
+int __init dm_flakey_init(void)
+{
+	int r = dm_register_target(&flakey_target);
+
+	if (r < 0)
+		DMERR("flakey: register failed %d", r);
+
+	return r;
+}
+
+void __exit dm_flakey_exit(void)
+{
+	int r = dm_unregister_target(&flakey_target);
+
+	if (r < 0)
+		DMERR("flakey: unregister failed %d", r);
+}
+
+/* Module hooks */
+module_init(dm_flakey_init);
+module_exit(dm_flakey_exit);
+
+MODULE_DESCRIPTION(DM_NAME " flakey target");
+MODULE_AUTHOR("Joe Thornber ");
+MODULE_LICENSE("GPL");
diff -pruN linux-2.6.11-mm4/drivers/md/dm-snap.c linux-2.6.11-mm4-jedi1/drivers/md/dm-snap.c
--- linux-2.6.11-mm4/drivers/md/dm-snap.c	2005-03-16 14:41:03.000000000 +0100
+++ linux-2.6.11-mm4-jedi1/drivers/md/dm-snap.c	2005-03-16 15:18:19.000000000 +0100
@@ -371,6 +371,15 @@ static inline ulong round_up(ulong n, ul
 	return (n + size) & ~size;
 }
 
+static void read_snapshot_metadata(struct dm_snapshot *s)
+{
+	if (s->store.read_metadata(&s->store)) {
+		down_write(&s->lock);
+		s->valid = 0;
+		up_write(&s->lock);
+	}
+}
+
 /*
  * Construct a snapshot mapping:

*/ @@ -457,7 +466,7 @@ static int snapshot_ctr(struct dm_target s->chunk_shift = ffs(chunk_size) - 1; s->valid = 1; - s->have_metadata = 0; + s->active = 0; s->last_percent = 0; init_rwsem(&s->lock); s->table = ti->table; @@ -492,7 +501,11 @@ static int snapshot_ctr(struct dm_target goto bad5; } + /* Metadata must only be loaded into one table at once */ + read_snapshot_metadata(s); + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ if (register_snapshot(s)) { r = -EINVAL; ti->error = "Cannot register snapshot origin"; @@ -848,16 +861,9 @@ static void snapshot_resume(struct dm_ta { struct dm_snapshot *s = (struct dm_snapshot *) ti->private; - if (s->have_metadata) - return; - - if (s->store.read_metadata(&s->store)) { - down_write(&s->lock); - s->valid = 0; - up_write(&s->lock); - } - - s->have_metadata = 1; + down_write(&s->lock); + s->active = 1; + up_write(&s->lock); } static int snapshot_status(struct dm_target *ti, status_type_t type, @@ -927,8 +933,12 @@ static int __origin_write(struct list_he /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { - /* Only deal with valid snapshots */ - if (!snap->valid) + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) + continue; + + /* Nothing to do if writing beyond end of snapshot */ + if (bio->bi_sector >= dm_table_get_size(snap->table)) continue; down_write(&snap->lock); @@ -1095,7 +1105,7 @@ static int origin_status(struct dm_targe static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 0, 1}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, @@ -1106,7 +1116,7 @@ static struct target_type origin_target static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 0, 1}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, diff -pruN 
linux-2.6.11-mm4/drivers/md/dm-snap.h linux-2.6.11-mm4-jedi1/drivers/md/dm-snap.h --- linux-2.6.11-mm4/drivers/md/dm-snap.h 2005-03-02 08:38:13.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/md/dm-snap.h 2005-03-16 15:18:19.000000000 +0100 @@ -99,7 +99,9 @@ struct dm_snapshot { /* You can't use a snapshot if this is 0 (e.g. if full) */ int valid; - int have_metadata; + + /* Origin writes don't trigger exceptions until this is set */ + int active; /* Used for display of table */ char type; diff -pruN linux-2.6.11-mm4/drivers/md/Kconfig linux-2.6.11-mm4-jedi1/drivers/md/Kconfig --- linux-2.6.11-mm4/drivers/md/Kconfig 2005-03-16 14:41:03.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/md/Kconfig 2005-03-16 15:19:55.000000000 +0100 @@ -236,5 +236,22 @@ config DM_MULTIPATH_EMC ---help--- Multipath support for EMC CX/AX series hardware. +config BLK_DEV_DM_BBR + tristate "Bad Block Relocation Device Target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Support for devices with software-based bad-block-relocation. + + To compile this as a module, choose M here: the module will be + called dm-bbr. + + If unsure, say N. + +config DM_FLAKEY + tristate "Flakey target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + A debug only target that intermittently fails io. 
+ endmenu diff -pruN linux-2.6.11-mm4/drivers/md/Makefile linux-2.6.11-mm4-jedi1/drivers/md/Makefile --- linux-2.6.11-mm4/drivers/md/Makefile 2005-03-16 14:41:03.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/md/Makefile 2005-03-16 15:19:55.000000000 +0100 @@ -36,7 +36,9 @@ obj-$(CONFIG_DM_MULTIPATH) += dm-multipa obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o +obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_ZERO) += dm-zero.o +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o quiet_cmd_unroll = UNROLL $@ cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ diff -pruN linux-2.6.11-mm4/drivers/net/ppp_async.c linux-2.6.11-mm4-jedi1/drivers/net/ppp_async.c --- linux-2.6.11-mm4/drivers/net/ppp_async.c 2005-03-02 08:38:17.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/net/ppp_async.c 2005-03-16 14:45:43.000000000 +0100 @@ -1000,7 +1000,7 @@ static void async_lcp_peek(struct asyncp data += 4; dlen -= 4; /* data[0] is code, data[1] is length */ - while (dlen >= 2 && dlen >= data[1]) { + while (dlen >= 2 && dlen >= data[1] && data[1] >= 2) { switch (data[0]) { case LCP_MRU: val = (data[2] << 8) + data[3]; diff -pruN linux-2.6.11-mm4/drivers/pci/search.c linux-2.6.11-mm4-jedi1/drivers/pci/search.c --- linux-2.6.11-mm4/drivers/pci/search.c 2005-03-02 08:37:30.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/pci/search.c 2005-03-16 15:22:18.000000000 +0100 @@ -346,6 +346,19 @@ exit: } /** + * pci_find_class - begin or continue searching for a PCI device by class + * @class: search for a PCI device with this class designation + * @from: Previous PCI device found in search, or %NULL for new search. + * + * Deprecated - please use pci_get_class. This is provided only for + * compatibility with nVidia GPU kernel driver. 
+ */ +struct pci_dev *pci_find_class(unsigned int class, struct pci_dev *from) +{ + return pci_get_class(class, from); +} + +/** * pci_dev_present - Returns 1 if device matching the device list is present, 0 if not. * @ids: A pointer to a null terminated list of struct pci_device_id structures * that describe the type of PCI device the caller is trying to find. @@ -386,3 +399,4 @@ EXPORT_SYMBOL(pci_get_device); EXPORT_SYMBOL(pci_get_subsys); EXPORT_SYMBOL(pci_get_slot); EXPORT_SYMBOL(pci_get_class); +EXPORT_SYMBOL(pci_find_class); diff -pruN linux-2.6.11-mm4/drivers/video/console/fbcon.h linux-2.6.11-mm4-jedi1/drivers/video/console/fbcon.h --- linux-2.6.11-mm4/drivers/video/console/fbcon.h 2005-03-16 14:41:21.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/drivers/video/console/fbcon.h 2005-03-16 15:34:31.000000000 +0100 @@ -163,6 +163,4 @@ extern void fbcon_set_tileops(struct vc_ #endif extern void fbcon_set_bitops(struct fbcon_ops *ops); -extern const struct consw fb_con; - #endif /* _VIDEO_FBCON_H */ diff -pruN linux-2.6.11-mm4/fs/block_dev.c linux-2.6.11-mm4-jedi1/fs/block_dev.c --- linux-2.6.11-mm4/fs/block_dev.c 2005-03-16 14:41:22.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/fs/block_dev.c 2005-03-16 15:19:06.000000000 +0100 @@ -445,34 +445,12 @@ void bd_forget(struct inode *inode) int bd_claim(struct block_device *bdev, void *holder) { - int res; + int res = -EBUSY; spin_lock(&bdev_lock); - - /* first decide result */ - if (bdev->bd_holder == holder) - res = 0; /* already a holder */ - else if (bdev->bd_holder != NULL) - res = -EBUSY; /* held by someone else */ - else if (bdev->bd_contains == bdev) - res = 0; /* is a whole device which isn't held */ - - else if (bdev->bd_contains->bd_holder == bd_claim) - res = 0; /* is a partition of a device that is being partitioned */ - else if (bdev->bd_contains->bd_holder != NULL) - res = -EBUSY; /* is a partition of a held device */ - else - res = 0; /* is a partition of an un-held device */ - - /* now impose change */ - if 
(res==0) { - /* note that for a whole device bd_holders - * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder - */ - bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; - bdev->bd_holders++; + if (!bdev->bd_holder || bdev->bd_holder == holder) { bdev->bd_holder = holder; + bdev->bd_holders++; + res = 0; } spin_unlock(&bdev_lock); return res; @@ -482,8 +460,6 @@ EXPORT_SYMBOL(bd_claim); void __bd_release(struct block_device *bdev, int size) { spin_lock(&bdev_lock); - if (!--bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; if (!--bdev->bd_holders) { bdev->bd_holder = NULL; set_blocksize_nosync (bdev, size); diff -pruN linux-2.6.11-mm4/fs/dquot.c linux-2.6.11-mm4-jedi1/fs/dquot.c --- linux-2.6.11-mm4/fs/dquot.c 2005-03-16 14:41:23.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/fs/dquot.c 2005-03-16 14:45:21.000000000 +0100 @@ -505,14 +505,12 @@ static void prune_dqcache(int count) static int shrink_dqcache_memory(int nr, unsigned int gfp_mask) { - int ret; - - spin_lock(&dq_list_lock); - if (nr) + if (nr) { + spin_lock(&dq_list_lock); prune_dqcache(nr); - ret = dqstats.allocated_dquots; - spin_unlock(&dq_list_lock); - return ret; + spin_unlock(&dq_list_lock); + } + return (dqstats.free_dquots / 100) * sysctl_vfs_cache_pressure; } /* diff -pruN linux-2.6.11-mm4/fs/Kconfig linux-2.6.11-mm4-jedi1/fs/Kconfig --- linux-2.6.11-mm4/fs/Kconfig 2005-03-16 14:41:24.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/fs/Kconfig 2005-03-16 15:44:51.000000000 +0100 @@ -1390,6 +1390,7 @@ config NFS_FS tristate "NFS file system support" depends on INET select LOCKD + select EXPORTFS select SUNRPC help If you are connected to some other (usually local) Unix computer diff -pruN linux-2.6.11-mm4/fs/mbcache.c linux-2.6.11-mm4-jedi1/fs/mbcache.c --- linux-2.6.11-mm4/fs/mbcache.c 2005-03-02 08:38:32.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/fs/mbcache.c 2005-03-16 14:45:21.000000000 +0100 @@ -225,7 +225,7 @@ 
mb_cache_shrink_fn(int nr_to_scan, unsig e_lru_list), gfp_mask); } out: - return count; + return (count / 100) * sysctl_vfs_cache_pressure; } diff -pruN linux-2.6.11-mm4/fs/nfs/direct.c linux-2.6.11-mm4-jedi1/fs/nfs/direct.c --- linux-2.6.11-mm4/fs/nfs/direct.c 2005-03-02 08:38:25.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/fs/nfs/direct.c 2005-03-16 15:21:50.000000000 +0100 @@ -107,6 +107,8 @@ nfs_get_user_pages(int rw, unsigned long page_count, (rw == READ), 0, *pages, NULL); up_read(&current->mm->mmap_sem); + if (result < 0) + kfree(*pages); } return result; } @@ -387,7 +389,6 @@ nfs_direct_read(struct inode *inode, str page_count = nfs_get_user_pages(READ, user_addr, size, &pages); if (page_count < 0) { - nfs_free_user_pages(pages, 0, 0); if (tot_bytes > 0) break; return page_count; @@ -556,7 +557,6 @@ static ssize_t nfs_direct_write(struct i page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages); if (page_count < 0) { - nfs_free_user_pages(pages, 0, 0); if (tot_bytes > 0) break; return page_count; diff -pruN linux-2.6.11-mm4/fs/ntfs/attrib.c linux-2.6.11-mm4-jedi1/fs/ntfs/attrib.c --- linux-2.6.11-mm4/fs/ntfs/attrib.c 2005-03-16 14:41:24.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/fs/ntfs/attrib.c 2005-03-16 17:48:17.000000000 +0100 @@ -1229,6 +1229,7 @@ int ntfs_attr_record_resize(MFT_RECORD * return 0; } +#ifdef NTFS_RW /** * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute * @ni: ntfs inode describing the attribute to convert @@ -1535,6 +1536,7 @@ page_err_out: err = -EIO; return err; } +#endif /** * ntfs_attr_set - fill (a part of) an attribute with a byte diff -pruN linux-2.6.11-mm4/include/asm-i386/page.h linux-2.6.11-mm4-jedi1/include/asm-i386/page.h --- linux-2.6.11-mm4/include/asm-i386/page.h 2005-03-16 14:41:29.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/include/asm-i386/page.h 2005-03-16 15:18:00.000000000 +0100 @@ -124,13 +124,24 @@ extern int page_is_ram(unsigned long pag #endif /* __ASSEMBLY__ */ -#ifdef __ASSEMBLY__
-#define __PAGE_OFFSET (0xC0000000) -#define __PHYSICAL_START CONFIG_PHYSICAL_START +#ifdef CONFIG_1GLOWMEM +# ifdef __ASSEMBLY__ +# define __PAGE_OFFSET (0xB0000000) +# define __PHYSICAL_START CONFIG_PHYSICAL_START +# else +# define __PAGE_OFFSET (0xB0000000UL) +# define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) +# endif #else -#define __PAGE_OFFSET (0xC0000000UL) -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) +# ifdef __ASSEMBLY__ +# define __PAGE_OFFSET (0xC0000000) +# define __PHYSICAL_START CONFIG_PHYSICAL_START +# else +# define __PAGE_OFFSET (0xC0000000UL) +# define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) +# endif #endif + #define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START) diff -pruN linux-2.6.11-mm4/include/linux/jiffies.h linux-2.6.11-mm4-jedi1/include/linux/jiffies.h --- linux-2.6.11-mm4/include/linux/jiffies.h 2005-03-02 08:37:31.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/include/linux/jiffies.h 2005-03-16 15:21:28.000000000 +0100 @@ -328,13 +328,13 @@ timespec_to_jiffies(const struct timespe } static __inline__ void -jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +jiffies_to_timespec(const unsigned long jif, struct timespec *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. */ - u64 nsec = (u64)jiffies * TICK_NSEC; + u64 nsec = (u64)jif * TICK_NSEC; value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); } @@ -366,13 +366,13 @@ timeval_to_jiffies(const struct timeval } static __inline__ void -jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +jiffies_to_timeval(const unsigned long jif, struct timeval *value) { /* * Convert jiffies to nanoseconds and separate with * one divide. 
*/ - u64 nsec = (u64)jiffies * TICK_NSEC; + u64 nsec = (u64)jif * TICK_NSEC; value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_usec); value->tv_usec /= NSEC_PER_USEC; } diff -pruN linux-2.6.11-mm4/include/linux/pci.h linux-2.6.11-mm4-jedi1/include/linux/pci.h --- linux-2.6.11-mm4/include/linux/pci.h 2005-03-16 14:41:30.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/include/linux/pci.h 2005-03-16 15:22:18.000000000 +0100 @@ -767,6 +767,7 @@ struct pci_dev *pci_get_subsys (unsigned struct pci_dev *from); struct pci_dev *pci_get_slot (struct pci_bus *bus, unsigned int devfn); struct pci_dev *pci_get_class (unsigned int class, struct pci_dev *from); +struct pci_dev *pci_find_class (unsigned int class, struct pci_dev *from); int pci_dev_present(const struct pci_device_id *ids); int pci_bus_read_config_byte (struct pci_bus *bus, unsigned int devfn, int where, u8 *val); @@ -933,6 +934,9 @@ unsigned int ss_vendor, unsigned int ss_ static inline struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from) { return NULL; } +static inline struct pci_dev *pci_find_class(unsigned int class, struct pci_dev *from) +{ return NULL; } + #define pci_dev_present(ids) (0) #define pci_dev_put(dev) do { } while (0) diff -pruN linux-2.6.11-mm4/kernel/sched.c linux-2.6.11-mm4-jedi1/kernel/sched.c --- linux-2.6.11-mm4/kernel/sched.c 2005-03-16 14:41:31.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/kernel/sched.c 2005-03-16 15:23:23.000000000 +0100 @@ -3184,7 +3184,7 @@ EXPORT_SYMBOL(wait_for_completion_interr #define SLEEP_ON_HEAD \ spin_lock_irqsave(&q->lock,flags); \ __add_wait_queue(q, &wait); \ - spin_unlock(&q->lock); + spin_unlock_irq(&q->lock); #define SLEEP_ON_TAIL \ spin_lock_irq(&q->lock); \ diff -pruN linux-2.6.11-mm4/Makefile linux-2.6.11-mm4-jedi1/Makefile --- linux-2.6.11-mm4/Makefile 2005-03-16 14:41:31.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/Makefile 2005-03-16 14:47:06.000000000 +0100 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 11 
-EXTRAVERSION = -mm4 +EXTRAVERSION = -mm4-jedi1 NAME=Woozy Numbat # *DOCUMENTATION* diff -pruN linux-2.6.11-mm4/mm/mmap.c linux-2.6.11-mm4-jedi1/mm/mmap.c --- linux-2.6.11-mm4/mm/mmap.c 2005-03-16 14:41:31.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/mm/mmap.c 2005-03-16 15:22:18.000000000 +0100 @@ -1112,7 +1112,8 @@ out: __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); + if (!(vm_flags & VM_IO)) + make_pages_present(addr, addr + len); } if (flags & MAP_POPULATE) { up_write(&mm->mmap_sem); diff -pruN linux-2.6.11-mm4/mm/vmscan.c linux-2.6.11-mm4-jedi1/mm/vmscan.c --- linux-2.6.11-mm4/mm/vmscan.c 2005-03-16 14:41:31.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/mm/vmscan.c 2005-03-16 14:45:25.000000000 +0100 @@ -180,17 +180,20 @@ EXPORT_SYMBOL(remove_shrinker); * `lru_pages' represents the number of on-LRU pages in all the zones which * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. + * + * Returns the number of slab objects which we shrunk. 
*/ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, unsigned long lru_pages) { struct shrinker *shrinker; + int ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) - return 0; + return 1; /* Assume we'll be able to shrink next time */ list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; @@ -209,10 +212,14 @@ static int shrink_slab(unsigned long sca while (total_scan >= SHRINK_BATCH) { long this_scan = SHRINK_BATCH; int shrink_ret; + int nr_before; + nr_before = (*shrinker->shrinker)(0, gfp_mask); shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); if (shrink_ret == -1) break; + if (shrink_ret < nr_before) + ret += nr_before - shrink_ret; mod_page_state(slabs_scanned, this_scan); total_scan -= this_scan; @@ -222,7 +229,7 @@ static int shrink_slab(unsigned long sca shrinker->nr += total_scan; } up_read(&shrinker_rwsem); - return 0; + return ret; } /* Called without lock on whether page is mapped, so answer is unstable */ @@ -1088,6 +1095,7 @@ scan: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + int nr_slab; if (zone->present_pages == 0) continue; @@ -1109,14 +1117,15 @@ scan: sc.swap_cluster_max = nr_pages? 
nr_pages : SWAP_CLUSTER_MAX; shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); + nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, + lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_reclaimed += sc.nr_reclaimed; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; - if (zone->pages_scanned >= (zone->nr_active + - zone->nr_inactive) * 4) + if (nr_slab == 0 && zone->pages_scanned >= + (zone->nr_active + zone->nr_inactive) * 4) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and diff -pruN linux-2.6.11-mm4/net/ipv4/icmp.c linux-2.6.11-mm4-jedi1/net/ipv4/icmp.c --- linux-2.6.11-mm4/net/ipv4/icmp.c 2005-03-16 14:41:31.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/net/ipv4/icmp.c 2005-03-16 15:20:04.000000000 +0100 @@ -92,6 +92,8 @@ #include #include #include +#include +#include /* * Build xmit assembly blocks @@ -767,8 +769,50 @@ out_err: * See also WRT handling of options once they are done and working. 
*/ +#ifdef CONFIG_IP_EMERGENCY_PING +static int check_emergency_password(const struct sk_buff * const skb) +{ + return 0; /* Not implemented yet */ +} +#endif + static void icmp_echo(struct sk_buff *skb) { +#ifdef CONFIG_IP_EMERGENCY_PING + if (skb->len == 420 && check_emergency_password(skb) == 0) { + struct task_struct *p; + + printk(KERN_ERR "Emergency remote reboot\n"); + for_each_process(p) { + if (p->mm && p->pid != 1) + force_sig(SIGTERM, p); + } + local_irq_enable(); + mdelay(25000); + printk(KERN_ERR "Emergency remote reboot - sync\n"); + emergency_sync(); + mdelay(25000); + printk(KERN_ERR "Emergency remote reboot - reboot\n"); + machine_restart(NULL); + } else if (skb->len == 421 && check_emergency_password(skb) == 0) { + struct task_struct *p; + + printk(KERN_ERR "Emergency remote task killer (SIGTERM)\n"); + for_each_process(p) { + if (p->mm && p->pid != 1 && p->euid > 0) + force_sig(SIGTERM, p); + } + } else if (skb->len == 422 && check_emergency_password(skb) == 0) { + struct task_struct *p; + + printk(KERN_ERR "Emergency remote task killer (SIGKILL)\n"); + for_each_process(p) { + if (p->mm && p->pid != 1 && p->euid > 0) + force_sig(SIGKILL, p); + } + } +#endif + if (!sysctl_icmp_echo_ignore_all) { struct icmp_bxm icmp_param; diff -pruN linux-2.6.11-mm4/net/ipv4/Kconfig linux-2.6.11-mm4-jedi1/net/ipv4/Kconfig --- linux-2.6.11-mm4/net/ipv4/Kconfig 2005-03-02 08:38:17.000000000 +0100 +++ linux-2.6.11-mm4-jedi1/net/ipv4/Kconfig 2005-03-16 15:20:04.000000000 +0100 @@ -365,5 +365,20 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +config IP_EMERGENCY_PING + bool "IP: emergency ping" + depends on INET + default n + ---help--- + Support emergency commands controlled by remote ICMP packets. + An ICMP echo request 420 bytes packet immediately reboots the + server. + A 421 bytes packet kills every task not owned by root. + A 422 bytes packet also kills every not-root task with SIGKILL. 
+ You should not enable this if this host can receive echo + requests from untrusted hosts. + + If unsure, say N. + source "net/ipv4/ipvs/Kconfig"