diff options
Diffstat (limited to 'meta-facebook/meta-wedge/recipes-wedge/fan-ctrl/fan-ctrl/fand.cpp')
-rw-r--r-- | meta-facebook/meta-wedge/recipes-wedge/fan-ctrl/fan-ctrl/fand.cpp | 851 |
1 files changed, 851 insertions, 0 deletions
diff --git a/meta-facebook/meta-wedge/recipes-wedge/fan-ctrl/fan-ctrl/fand.cpp b/meta-facebook/meta-wedge/recipes-wedge/fan-ctrl/fan-ctrl/fand.cpp new file mode 100644 index 0000000..24e107c --- /dev/null +++ b/meta-facebook/meta-wedge/recipes-wedge/fan-ctrl/fan-ctrl/fand.cpp @@ -0,0 +1,851 @@ +/* + * fand + * + * Copyright 2014-present Facebook. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Daemon to manage the fan speed to ensure that we stay within a reasonable + * temperature range. We're using a simplistic algorithm to get started: + * + * If the fan is already on high, we'll move it to medium if we fall below + * a top temperature. If we're on medium, we'll move it to high + * if the temperature goes over the top value, and to low if the + * temperature falls to a bottom level. If the fan is on low, + * we'll increase the speed if the temperature rises to the top level. + * + * To ensure that we're not just turning the fans up, then back down again, + * we'll require an extra few degrees of temperature drop before we lower + * the fan speed. + * + * We check the RPM of the fans against the requested RPMs to determine + * whether the fans are failing, in which case we'll turn up all of + * the other fans and report the problem.. + * + * TODO: Implement a PID algorithm to closely track the ideal temperature. + * TODO: Determine if the daemon is already started. + */ + +/* Yeah, the file ends in .cpp, but it's a C program. Deal. */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <signal.h> +#include <syslog.h> +#include "watchdog.h" + +#include "facebook/wedge_eeprom.h" + +/* Sensor definitions */ + +#define INTERNAL_TEMPS(x) ((x) * 1000) // stored a C * 1000 +#define EXTERNAL_TEMPS(x) ((x) / 1000) + +#define I2C_BUS_3_DIR "/sys/class/i2c-adapter/i2c-3/" +#define I2C_BUS_4_DIR "/sys/class/i2c-adapter/i2c-4/" + +#define INTAKE_TEMP_DEVICE I2C_BUS_3_DIR "3-0048" +#define T2_TEMP_DEVICE I2C_BUS_3_DIR "3-0049" +#define EXHAUST_TEMP_DEVICE I2C_BUS_3_DIR "3-004a" +#define USERVER_TEMP_DEVICE I2C_BUS_4_DIR "4-0040" + +/* + * The sensor for the uServer CPU is not on the CPU itself, so it reads + * a little low. We are special casing this, but we should obviously + * be thinking about a way to generalize these tweaks, and perhaps + * the entire configuration. JSON file? + */ + +#define USERVER_TEMP_FUDGE INTERNAL_TEMPS(10) + +#define BAD_TEMP INTERNAL_TEMPS(-60) + +#define BAD_READ_THRESHOLD 4 /* How many times can reads fail */ +#define FAN_FAILURE_THRESHOLD 4 /* How many times can a fan fail */ + +#define PWM_DIR "/sys/devices/platform/ast_pwm_tacho.0" + +#define PWM_UNIT_MAX 96 + +#define LARGEST_DEVICE_NAME 120 + +#define GPIO_USERVER_POWER_DIRECTION "/sys/class/gpio/gpio25/direction" +#define GPIO_USERVER_POWER "/sys/class/gpio/gpio25/value" +#define GPIO_T2_POWER_DIRECTION "/sys/class/gpio/gpio41/direction" +#define GPIO_T2_POWER "/sys/class/gpio/gpio41/value" + +#define GPIO_FAN0_LED "/sys/class/gpio/gpio53/value" +#define GPIO_FAN1_LED "/sys/class/gpio/gpio54/value" +#define GPIO_FAN2_LED "/sys/class/gpio/gpio55/value" +#define GPIO_FAN3_LED "/sys/class/gpio/gpio72/value" + +const char *fan_led[] = {GPIO_FAN0_LED, GPIO_FAN1_LED, + GPIO_FAN2_LED, GPIO_FAN3_LED}; + +#define FAN_LED_RED "0" +#define FAN_LED_BLUE "1" + +#define GPIO_PIN_ID "/sys/class/gpio/gpio%d/value" +#define REV_IDS 3 +#define GPIO_REV_ID_START 192 + +#define BOARD_IDS 4 +#define GPIO_BOARD_ID_START 160 + +/* + * With hardware revisions after 3, we use a different set of pins for + * the BOARD_ID. + */ + +#define REV_ID_NEW_BOARD_ID 3 +#define GPIO_BOARD_ID_START_NEW 166 + +#define REPORT_TEMP 720 /* Report temp every so many cycles */ + +/* Sensor limits and tuning parameters */ + +#define INTAKE_LIMIT INTERNAL_TEMPS(60) +#define T2_LIMIT INTERNAL_TEMPS(95) +#define USERVER_LIMIT INTERNAL_TEMPS(75) + +#define TEMP_TOP INTERNAL_TEMPS(70) +#define TEMP_BOTTOM INTERNAL_TEMPS(40) + +/* + * Toggling the fan constantly will wear it out (and annoy anyone who + * can hear it), so we'll only turn down the fan after the temperature + * has dipped a bit below the point at which we'd otherwise switch + * things up. + */ + +#define COOLDOWN_SLOP INTERNAL_TEMPS(6) + +#define FAN_LOW 35 +#define FAN_MEDIUM 50 +#define FAN_HIGH 70 +#define FAN_MAX 99 + +/* + * Mapping physical to hardware addresses for fans; it's different for + * RPM measuring and PWM setting, naturally. Doh. + */ + +int fan_to_rpm_map[] = {3, 2, 0, 1}; +int fan_to_pwm_map[] = {7, 6, 0, 1}; + +#define FANS 4 + +/* + * The measured RPM of the fans doesn't match linearly to the requested + * rate. In addition, there are coaxially mounted fans, so the rear fans + * feed into the front fans. The rear fans will run slower since they're + * grabbing still air, and the front fants are getting an extra boost. + * + * We'd like to measure the fan RPM and compare it to the expected RPM + * so that we can detect failed fans, so we have a table (derived from + * hardware testing): + */ + +struct rpm_to_pct_map { + ushort pct; + ushort rpm; +}; + +struct rpm_to_pct_map rpm_front_map[] = {{30, 6150}, + {35, 7208}, + {40, 8195}, + {45, 9133}, + {50, 10017}, + {55, 10847}, + {60, 11612}, + {65, 12342}, + {70, 13057}, + {75, 13717}, + {80, 14305}, + {85, 14869}, + {90, 15384}, + {95, 15871}, + {100, 16095}}; + +struct rpm_to_pct_map rpm_rear_map[] = {{30, 3911}, + {35, 4760}, + {40, 5587}, + {45, 6434}, + {50, 7295}, + {55, 8187}, + {60, 9093}, + {65, 10008}, + {70, 10949}, + {75, 11883}, + {80, 12822}, + {85, 13726}, + {90, 14690}, + {95, 15516}, + {100, 15897}}; + +#define FAN_FAILURE_OFFSET 30 + +int fan_low = FAN_LOW; +int fan_medium = FAN_MEDIUM; +int fan_high = FAN_HIGH; +int fan_max = FAN_MAX; +int total_fans = FANS; +int fan_offset = 0; + +int temp_bottom = TEMP_BOTTOM; +int temp_top = TEMP_TOP; + +int report_temp = REPORT_TEMP; +bool verbose = false; + +void usage() { + fprintf(stderr, + "fand [-v] [-l <low-pct>] [-m <medium-pct>] " + "[-h <high-pct>]\n" + "\t[-b <temp-bottom>] [-t <temp-top>] [-r <report-temp>]\n\n" + "\tlow-pct defaults to %d%% fan\n" + "\tmedium-pct defaults to %d%% fan\n" + "\thigh-pct defaults to %d%% fan\n" + "\ttemp-bottom defaults to %dC\n" + "\ttemp-top defaults to %dC\n" + "\treport-temp defaults to every %d measurements\n\n" + "fand compensates for uServer temperature reading %d degrees low\n" + "kill with SIGUSR1 to stop watchdog\n", + fan_low, + fan_medium, + fan_high, + EXTERNAL_TEMPS(temp_bottom), + EXTERNAL_TEMPS(temp_top), + report_temp, + EXTERNAL_TEMPS(USERVER_TEMP_FUDGE)); + exit(1); +} + +/* We need to open the device each time to read a value */ + +int read_device(const char *device, int *value) { + FILE *fp; + int rc; + + fp = fopen(device, "r"); + if (!fp) { + int err = errno; + + syslog(LOG_INFO, "failed to open device %s", device); + return err; + } + + rc = fscanf(fp, "%d", value); + fclose(fp); + + if (rc != 1) { + syslog(LOG_INFO, "failed to read device %s", device); + return ENOENT; + } else { + return 0; + } +} + +/* We need to open the device again each time to write a value */ + +int write_device(const char *device, const char *value) { + FILE *fp; + int rc; + + fp = fopen(device, "w"); + if (!fp) { + int err = errno; + + syslog(LOG_INFO, "failed to open device for write %s", device); + return err; + } + + rc = fputs(value, fp); + fclose(fp); + + if (rc < 0) { + syslog(LOG_INFO, "failed to write device %s", device); + return ENOENT; + } else { + return 0; + } +} + +int read_temp(const char *device, int *value) { + char full_name[LARGEST_DEVICE_NAME + 1]; + + /* We set an impossible value to check for errors */ + *value = BAD_TEMP; + snprintf( + full_name, LARGEST_DEVICE_NAME, "%s/temp1_input", device); + return read_device(full_name, value); +} + +int read_gpio_value(const int id, const char *device, int *value) { + char full_name[LARGEST_DEVICE_NAME]; + + snprintf(full_name, LARGEST_DEVICE_NAME, device, id); + return read_device(full_name, value); +} + +int read_gpio_values(const int start, const int count, + const char *device, int *result) { + int status = 0; + int value; + + *result = 0; + for (int i = 0; i < count; i++) { + status |= read_gpio_value(start + i, GPIO_PIN_ID, &value); + *result |= value << i; + } + return status; +} + +int read_ids(int *rev_id, int *board_id) { + int status = 0; + int value; + + status = read_gpio_values(GPIO_REV_ID_START, REV_IDS, GPIO_PIN_ID, rev_id); + if (status != 0) { + syslog(LOG_INFO, "failed to read rev_id"); + return status; + } + + int board_id_start; + if (*rev_id >= REV_ID_NEW_BOARD_ID) { + board_id_start = GPIO_BOARD_ID_START_NEW; + } else { + board_id_start = GPIO_BOARD_ID_START; + } + + status = read_gpio_values(board_id_start, BOARD_IDS, GPIO_PIN_ID, board_id); + if (status != 0) { + syslog(LOG_INFO, "failed to read board_id"); + } + return status; +} + +bool is_two_fan_board(bool verbose) { + struct wedge_eeprom_st eeprom; + /* Retrieve the board type from EEPROM */ + if (wedge_eeprom_parse(NULL, &eeprom) == 0) { + /* able to parse EEPROM */ + if (verbose) { + syslog(LOG_INFO, "board type is %s", eeprom.fbw_location); + } + /* only WEDGE is NOT two-fan board */ + return strncasecmp(eeprom.fbw_location, "wedge", + sizeof(eeprom.fbw_location)); + } else { + int status; + int board_id = 0; + int rev_id = 0; + /* + * Could not parse EEPROM. Most likely, it is an old HW without EEPROM. + * In this case, use board ID to distinguish if it is wedge or 6-pack. + */ + status = read_ids(&rev_id, &board_id); + if (verbose) { + syslog(LOG_INFO, "rev ID %d, board id %d", rev_id, board_id); + } + if (status == 0 && board_id != 0xf) { + return true; + } else { + return false; + } + } +} + +int read_fan_value(const int fan, const char *device, int *value) { + char device_name[LARGEST_DEVICE_NAME]; + char output_value[LARGEST_DEVICE_NAME]; + char full_name[LARGEST_DEVICE_NAME]; + + snprintf(device_name, LARGEST_DEVICE_NAME, device, fan); + snprintf(full_name, LARGEST_DEVICE_NAME, "%s/%s", PWM_DIR, device_name); + return read_device(full_name, value); +} + +int write_fan_value(const int fan, const char *device, const int value) { + char full_name[LARGEST_DEVICE_NAME]; + char device_name[LARGEST_DEVICE_NAME]; + char output_value[LARGEST_DEVICE_NAME]; + + snprintf(device_name, LARGEST_DEVICE_NAME, device, fan); + snprintf(full_name, LARGEST_DEVICE_NAME, "%s/%s", PWM_DIR, device_name); + snprintf(output_value, LARGEST_DEVICE_NAME, "%d", value); + return write_device(full_name, output_value); +} + +/* Return fan speed as a percentage of maximum -- not necessarily linear. */ + +int fan_rpm_to_pct(const struct rpm_to_pct_map *table, + const int table_len, + int rpm) { + int i; + + for (i = 0; i < table_len; i++) { + if (table[i].rpm > rpm) { + break; + } + } + + /* + * If the fan RPM is lower than the lowest value in the table, + * we may have a problem -- fans can only go so slow, and it might + * have stopped. In this case, we'll return an interpolated + * percentage, as just returning zero is even more problematic. + */ + + if (i == 0) { + return (rpm * table[i].pct) / table[i].rpm; + } else if (i == table_len) { // Fell off the top? + return table[i - 1].pct; + } + + // Interpolate the right percentage value: + + int percent_diff = table[i].pct - table[i - 1].pct; + int rpm_diff = table[i].rpm - table[i - 1].rpm; + int fan_diff = table[i].rpm - rpm; + + return table[i].pct - (fan_diff * percent_diff / rpm_diff); +} + +int fan_speed_okay(const int fan, const int speed, const int slop) { + int front_fan, rear_fan; + int front_pct, rear_pct; + int real_fan; + int okay; + + /* + * The hardware fan numbers are different from the physical order + * in the box, so we have to map them: + */ + + real_fan = fan_to_rpm_map[fan]; + + front_fan = 0; + rear_fan = 0; + read_fan_value(real_fan, "tacho%d_rpm", &front_fan); + read_fan_value(real_fan + 4, "tacho%d_rpm", &rear_fan); + + front_pct = + fan_rpm_to_pct(rpm_front_map, + sizeof(rpm_front_map) / sizeof(struct rpm_to_pct_map), + front_fan); + rear_pct = + fan_rpm_to_pct(rpm_rear_map, + sizeof(rpm_rear_map) / sizeof(struct rpm_to_pct_map), + rear_fan); + + /* + * If the fans are broken, the measured rate will be rather + * different from the requested rate, and we can turn up the + * rest of the fans to compensate. The slop is the percentage + * of error that we'll tolerate. + * + * XXX: I suppose that we should only measure negative values; + * running too fast isn't really a problem. + */ + + okay = (abs(front_pct - speed) * 100 / speed < slop && + abs(rear_pct - speed) * 100 / speed < slop); + + if (!okay || verbose) { + syslog(!okay ? LOG_ALERT : LOG_INFO, + "fan %d rear %d (%d%%), front %d (%d%%), expected %d", + fan, + rear_fan, + rear_pct, + front_fan, + front_pct, + speed); + } + + return okay; +} + +/* Set fan speed as a percentage */ + +int write_fan_speed(const int fan, const int value) { + /* + * The hardware fan numbers for pwm control are different from + * both the physical order in the box, and the mapping for reading + * the RPMs per fan, above. + */ + + int real_fan = fan_to_pwm_map[fan]; + + if (value == 0) { + return write_fan_value(real_fan, "pwm%d_en", 0); + } else { + int unit = value * PWM_UNIT_MAX / 100; + int status; + + if (unit == PWM_UNIT_MAX) + unit = 0; + + if ((status = write_fan_value(real_fan, "pwm%d_type", 0)) != 0 || + (status = write_fan_value(real_fan, "pwm%d_rising", 0)) != 0 || + (status = write_fan_value(real_fan, "pwm%d_falling", unit)) != 0 || + (status = write_fan_value(real_fan, "pwm%d_en", 1)) != 0) { + return status; + } + } +} + +/* Set up fan LEDs */ + +int write_fan_led(const int fan, const char *color) +{ + return write_device(fan_led[fan], color); +} + +int server_shutdown(const char *why) { + int fan; + for (fan = 0; fan < total_fans; fan++) { + write_fan_speed(fan + fan_offset, fan_max); + } + + syslog(LOG_EMERG, "Shutting down: %s", why); + write_device(GPIO_USERVER_POWER_DIRECTION, "out"); + write_device(GPIO_USERVER_POWER, "0"); + /* + * Putting T2 in reset generating a non-maskable interrupt to uS, + * the kernel running on uS might panic depending on its version. + * sleep 5s here to make sure uS is completely down. + */ + sleep(5); + write_device(GPIO_T2_POWER_DIRECTION, "out"); + write_device(GPIO_T2_POWER, "0"); + + /* + * We have to stop the watchdog, or the system will be automatically + * rebooted some seconds after fand exits (and stops kicking the + * watchdog). + */ + + stop_watchdog(); + + sleep(2); + exit(2); +} + +/* Gracefully shut down on receipt of a signal */ + +void fand_interrupt(int sig) +{ + int fan; + for (fan = 0; fan < total_fans; fan++) { + write_fan_speed(fan + fan_offset, fan_max); + } + + syslog(LOG_ALERT, "Shutting down fand on signal %s", strsignal(sig)); + if (sig == SIGUSR1) { + stop_watchdog(); + } + exit(3); +} + +int main(int argc, char **argv) { + /* Sensor values */ + + int intake_temp; + int exhaust_temp; + int t2_temp; + int userver_temp; + + int fan_speed = FAN_HIGH; + int bad_reads = 0; + int fan_failure = 0; + int fan_speed_changes = 0; + int old_speed; + + int fan_bad[FANS]; + int fan; + + unsigned log_count = 0; // How many times have we logged our temps? + int opt; + int prev_fans_bad = 0; + + struct sigaction sa; + + sa.sa_handler = fand_interrupt; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + + sigaction(SIGTERM, &sa, NULL); + sigaction(SIGINT, &sa, NULL); + sigaction(SIGUSR1, &sa, NULL); + + while ((opt = getopt(argc, argv, "l:m:h:b:t:r:v")) != -1) { + switch (opt) { + case 'l': + fan_low = atoi(optarg); + break; + case 'm': + fan_medium = atoi(optarg); + break; + case 'h': + fan_high = atoi(optarg); + break; + case 'b': + temp_bottom = INTERNAL_TEMPS(atoi(optarg)); + break; + case 't': + temp_top = INTERNAL_TEMPS(atoi(optarg)); + break; + case 'r': + report_temp = atoi(optarg); + break; + case 'v': + verbose = true; + break; + default: + usage(); + break; + } + } + + if (optind > argc) { + usage(); + } + + if (temp_bottom > temp_top) { + fprintf(stderr, + "Should temp-bottom (%d) be higher than " + "temp-top (%d)? Starting anyway.\n", + EXTERNAL_TEMPS(temp_bottom), + EXTERNAL_TEMPS(temp_top)); + } + + if (fan_low > fan_medium || fan_low > fan_high || fan_medium > fan_high) { + fprintf(stderr, + "fan RPMs not strictly increasing " + "-- %d, %d, %d, starting anyway\n", + fan_low, + fan_medium, + fan_high); + } + + daemon(1, 0); + openlog("fand", LOG_CONS, LOG_DAEMON); + + /* Start watchdog in manual mode */ + start_watchdog(0); + + /* Set watchdog to persistent mode so timer expiry will happen independent + * of this process's liveliness. */ + set_persistent_watchdog(WATCHDOG_SET_PERSISTENT); + + if (is_two_fan_board(verbose)) { + /* Alternate, two fan configuration */ + total_fans = 2; + fan_offset = 2; /* fan 3 is the first */ + } + + for (fan = 0; fan < total_fans; fan++) { + fan_bad[fan] = 0; + write_fan_speed(fan + fan_offset, fan_speed); + write_fan_led(fan + fan_offset, FAN_LED_BLUE); + } + + sleep(5); /* Give the fans time to come up to speed */ + + while (1) { + int max_temp; + old_speed = fan_speed; + + /* Read sensors */ + + read_temp(INTAKE_TEMP_DEVICE, &intake_temp); + read_temp(EXHAUST_TEMP_DEVICE, &exhaust_temp); + read_temp(T2_TEMP_DEVICE, &t2_temp); + read_temp(USERVER_TEMP_DEVICE, &userver_temp); + + /* + * uServer can be powered down, but all of the rest of the sensors + * should be readable at any time. + */ + + if ((intake_temp == BAD_TEMP || exhaust_temp == BAD_TEMP || + t2_temp == BAD_TEMP)) { + bad_reads++; + } + + if (bad_reads > BAD_READ_THRESHOLD) { + server_shutdown("Some sensors couldn't be read"); + } + + if (log_count++ % report_temp == 0) { + syslog(LOG_DEBUG, + "Temp intake %d, t2 %d, userver %d, exhaust %d, " + "fan speed %d, speed changes %d", + intake_temp, + t2_temp, + userver_temp, + exhaust_temp, + fan_speed, + fan_speed_changes); + } + + /* Protection heuristics */ + + if (intake_temp > INTAKE_LIMIT) { + server_shutdown("Intake temp limit reached"); + } + + if (t2_temp > T2_LIMIT) { + server_shutdown("T2 temp limit reached"); + } + + if (userver_temp + USERVER_TEMP_FUDGE > USERVER_LIMIT) { + server_shutdown("uServer temp limit reached"); + } + + /* + * Calculate change needed -- we should eventually + * do something more sophisticated, like PID. + * + * We should use the intake temperature to adjust this + * as well. + */ + + if (t2_temp > userver_temp + USERVER_TEMP_FUDGE) { + max_temp = t2_temp; + } else { + max_temp = userver_temp + USERVER_TEMP_FUDGE; + } + + /* + * If recovering from a fan problem, spin down fans gradually in case + * temperatures are still high. Gradual spin down also reduces wear on + * the fans. + */ + if (fan_speed == fan_max) { + if (fan_failure == 0) { + fan_speed = fan_high; + } + } else if (fan_speed == fan_high) { + if (max_temp + COOLDOWN_SLOP < temp_top) { + fan_speed = fan_medium; + } + } else if (fan_speed == fan_medium) { + if (max_temp > temp_top) { + fan_speed = fan_high; + } else if (max_temp + COOLDOWN_SLOP < temp_bottom) { + fan_speed = fan_low; + } + } else {/* low */ + if (max_temp > temp_bottom) { + fan_speed = fan_medium; + } + } + + /* + * Update fans only if there are no failed ones. If any fans failed + * earlier, all remaining fans should continue to run at max speed. + */ + + if (fan_failure == 0 && fan_speed != old_speed) { + syslog(LOG_NOTICE, + "Fan speed changing from %d to %d", + old_speed, + fan_speed); + fan_speed_changes++; + for (fan = 0; fan < total_fans; fan++) { + write_fan_speed(fan + fan_offset, fan_speed); + } + } + + /* + * Wait for some change. Typical I2C temperature sensors + * only provide a new value every second and a half, so + * checking again more quickly than that is a waste. + * + * We also have to wait for the fan changes to take effect + * before measuring them. + */ + + sleep(5); + + /* Check fan RPMs */ + + for (fan = 0; fan < total_fans; fan++) { + /* + * Make sure that we're within some percentage + * of the requested speed. + */ + if (fan_speed_okay(fan + fan_offset, fan_speed, FAN_FAILURE_OFFSET)) { + if (fan_bad[fan] > FAN_FAILURE_THRESHOLD) { + write_fan_led(fan + fan_offset, FAN_LED_BLUE); + syslog(LOG_NOTICE, + "Fan %d has recovered", + fan); + } + fan_bad[fan] = 0; + } else { + fan_bad[fan]++; + } + } + + fan_failure = 0; + for (fan = 0; fan < total_fans; fan++) { + if (fan_bad[fan] > FAN_FAILURE_THRESHOLD) { + fan_failure++; + write_fan_led(fan + fan_offset, FAN_LED_RED); + } + } + + if (fan_failure > 0) { + if (prev_fans_bad != fan_failure) { + syslog(LOG_ALERT, "%d fans failed", fan_failure); + } + + /* + * If fans are bad, we need to blast all of the + * fans at 100%; we don't bother to turn off + * the bad fans, in case they are all that is left. + * + * Note that we have a temporary bug with setting fans to + * 100% so we only do fan_max = 99%. + */ + + fan_speed = fan_max; + for (fan = 0; fan < total_fans; fan++) { + write_fan_speed(fan + fan_offset, fan_speed); + } + + /* + * Fans can be hot swapped and replaced; in which case the fan daemon + * will automatically detect the new fan and (assuming the new fan isn't + * itself faulty), automatically readjust the speeds for all fans down + * to a more suitable rpm. The fan daemon does not need to be restarted. + */ + } + + /* Suppress multiple warnings for similar number of fan failures. */ + prev_fans_bad = fan_failure; + + /* if everything is fine, restart the watchdog countdown. If this process + * is terminated, the persistent watchdog setting will cause the system + * to reboot after the watchdog timeout. */ + kick_watchdog(); + } +} |