| /* |
| * Workingset detection |
| * |
| * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner |
| */ |
| |
| #include <linux/memcontrol.h> |
| #include <linux/writeback.h> |
| #include <linux/pagemap.h> |
| #include <linux/atomic.h> |
| #include <linux/module.h> |
| #include <linux/swap.h> |
| #include <linux/fs.h> |
| #include <linux/mm.h> |
| |
| /* |
| * Double CLOCK lists |
| * |
| * Per zone, two clock lists are maintained for file pages: the |
| * inactive and the active list. Freshly faulted pages start out at |
| * the head of the inactive list and page reclaim scans pages from the |
| * tail. Pages that are accessed multiple times on the inactive list |
| * are promoted to the active list, to protect them from reclaim, |
| * whereas active pages are demoted to the inactive list when the |
| * active list grows too big. |
| * |
| *   fault ------------------------+ |
| *                                 | |
| *              +--------------+   |            +-------------+ |
| *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+ |
| *              +--------------+                +-------------+    | |
| *                     |                                           | |
| *                     +-------------- promotion ------------------+ |
| * |
| * |
| * Access frequency and refault distance |
| * |
| * A workload is thrashing when its pages are frequently used but they |
| * are evicted from the inactive list every time before another access |
| * would have promoted them to the active list. |
| * |
| * In cases where the average access distance between thrashing pages |
| * is bigger than the size of memory there is nothing that can be |
| * done - the thrashing set could never fit into memory under any |
| * circumstance. |
| * |
| * However, the average access distance could be bigger than the |
| * inactive list, yet smaller than the size of memory. In this case, |
| * the set could fit into memory if it weren't for the currently |
| * active pages - which may be used more, hopefully less frequently: |
| * |
| *        +-memory available to cache-+ |
| *        |                           | |
| *        +-inactive------+-active----+ |
| *    a b | c d e f g h i | J K L M N | |
| *        +---------------+-----------+ |
| * |
| * It is prohibitively expensive to accurately track access frequency |
| * of pages. But a reasonable approximation can be made to measure |
| * thrashing on the inactive list, after which refaulting pages can be |
| * activated optimistically to compete with the existing active pages. |
| * |
| * Approximating inactive page access frequency - Observations: |
| * |
| * 1. When a page is accessed for the first time, it is added to the |
| * head of the inactive list, slides every existing inactive page |
| * towards the tail by one slot, and pushes the current tail page |
| * out of memory. |
| * |
| * 2. When a page is accessed for the second time, it is promoted to |
| * the active list, shrinking the inactive list by one slot. This |
| * also slides all inactive pages that were faulted into the cache |
| * more recently than the activated page towards the tail of the |
| * inactive list. |
| * |
| * Thus: |
| * |
| * 1. The sum of evictions and activations between any two points in |
| * time indicates the minimum number of inactive pages accessed in |
| * between. |
| * |
| * 2. Moving one inactive page N page slots towards the tail of the |
| * list requires at least N inactive page accesses. |
| * |
| * Combining these: |
| * |
| * 1. When a page is finally evicted from memory, the number of |
| * inactive pages accessed while the page was in cache is at least |
| * the number of page slots on the inactive list. |
| * |
| * 2. In addition, measuring the sum of evictions and activations (E) |
| * at the time of a page's eviction, and comparing it to another |
| * reading (R) at the time the page faults back into memory tells |
| * the minimum number of accesses while the page was not cached. |
| * This is called the refault distance. |
| * |
| * Because the first access of the page was the fault and the second |
| * access the refault, we combine the in-cache distance with the |
| * out-of-cache distance to get the complete minimum access distance |
| * of this page: |
| * |
| * NR_inactive + (R - E) |
| * |
| * And knowing the minimum access distance of a page, we can easily |
| * tell if the page would be able to stay in cache assuming all page |
| * slots in the cache were available: |
| * |
| * NR_inactive + (R - E) <= NR_inactive + NR_active |
| * |
| * which can be further simplified to |
| * |
| * (R - E) <= NR_active |
| * |
| * Put into words, the refault distance (out-of-cache) can be seen as |
| * a deficit in inactive list space (in-cache). If the inactive list |
| * had (R - E) more page slots, the page would not have been evicted |
| * in between accesses, but activated instead. And on a full system, |
| * the only thing eating into inactive list space is active pages. |
| * |
| * |
| * Activating refaulting pages |
| * |
| * All that is known about the active list is that the pages have been |
| * accessed more than once in the past. This means that at any given |
| * time there is actually a good chance that pages on the active list |
| * are no longer in active use. |
| * |
| * So when a refault distance of (R - E) is observed and there are at |
| * least (R - E) active pages, the refaulting page is activated |
| * optimistically in the hope that (R - E) active pages are actually |
| * used less frequently than the refaulting page - or even not used at |
| * all anymore. |
| * |
| * If this is wrong and demotion kicks in, the pages which are truly |
| * used more frequently will be reactivated while the less frequently |
| * used ones will be evicted from memory. |
| * |
| * But if this is right, the stale pages will be pushed out of memory |
| * and the used pages get to stay in cache. |
| * |
| * |
| * Implementation |
| * |
| * For each zone's file LRU lists, a counter for inactive evictions |
| * and activations is maintained (zone->inactive_age). |
| * |
| * On eviction, a snapshot of this counter (along with some bits to |
| * identify the zone) is stored in the now empty page cache radix tree |
| * slot of the evicted page. This is called a shadow entry. |
| * |
| * On cache misses for which there are shadow entries, an eligible |
| * refault distance will immediately activate the refaulting page. |
| */ |
| |
| static void *pack_shadow(unsigned long eviction, struct zone *zone) |
| { |
| eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); |
| eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); |
| eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); |
| |
| return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); |
| } |
| |
| static void unpack_shadow(void *shadow, |
| struct zone **zone, |
| unsigned long *distance) |
| { |
| unsigned long entry = (unsigned long)shadow; |
| unsigned long eviction; |
| unsigned long refault; |
| unsigned long mask; |
| int zid, nid; |
| |
| entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; |
| zid = entry & ((1UL << ZONES_SHIFT) - 1); |
| entry >>= ZONES_SHIFT; |
| nid = entry & ((1UL << NODES_SHIFT) - 1); |
| entry >>= NODES_SHIFT; |
| eviction = entry; |
| |
| *zone = NODE_DATA(nid)->node_zones + zid; |
| |
| refault = atomic_long_read(&(*zone)->inactive_age); |
| mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + |
| RADIX_TREE_EXCEPTIONAL_SHIFT); |
| /* |
| * The unsigned subtraction here gives an accurate distance |
| * across inactive_age overflows in most cases. |
| * |
| * There is a special case: usually, shadow entries have a |
| * short lifetime and are either refaulted or reclaimed along |
| * with the inode before they get too old. But it is not |
| * impossible for the inactive_age to lap a shadow entry in |
| * the field, which can then can result in a false small |
| * refault distance, leading to a false activation should this |
| * old entry actually refault again. However, earlier kernels |
| * used to deactivate unconditionally with *every* reclaim |
| * invocation for the longest time, so the occasional |
| * inappropriate activation leading to pressure on the active |
| * list is not a problem. |
| */ |
| *distance = (refault - eviction) & mask; |
| } |
| |
| /** |
| * workingset_eviction - note the eviction of a page from memory |
| * @mapping: address space the page was backing |
| * @page: the page being evicted |
| * |
| * Returns a shadow entry to be stored in @mapping->page_tree in place |
| * of the evicted @page so that a later refault can be detected. |
| */ |
| void *workingset_eviction(struct address_space *mapping, struct page *page) |
| { |
| struct zone *zone = page_zone(page); |
| unsigned long eviction; |
| |
| eviction = atomic_long_inc_return(&zone->inactive_age); |
| return pack_shadow(eviction, zone); |
| } |
| |
| /** |
| * workingset_refault - evaluate the refault of a previously evicted page |
| * @shadow: shadow entry of the evicted page |
| * |
| * Calculates and evaluates the refault distance of the previously |
| * evicted page in the context of the zone it was allocated in. |
| * |
| * Returns %true if the page should be activated, %false otherwise. |
| */ |
| bool workingset_refault(void *shadow) |
| { |
| unsigned long refault_distance; |
| struct zone *zone; |
| |
| unpack_shadow(shadow, &zone, &refault_distance); |
| inc_zone_state(zone, WORKINGSET_REFAULT); |
| |
| if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { |
| inc_zone_state(zone, WORKINGSET_ACTIVATE); |
| return true; |
| } |
| return false; |
| } |
| |
| /** |
| * workingset_activation - note a page activation |
| * @page: page that is being activated |
| */ |
| void workingset_activation(struct page *page) |
| { |
| atomic_long_inc(&page_zone(page)->inactive_age); |
| } |