Fix stdio performance & memory problems

12 Aug 2014

Hey guys! After spending several weeks messing around with FATFileSystem in order to create my custom SDFileSystem library, I've come to the conclusion that the current stdio retarget layer is very inefficient, especially on platforms using the μARM toolchain, which is most of them.

For starters, fopen() takes an atrocious amount of memory to open a file. In my experiments, threads calling fopen() needed at least 1600B of stack on an LPC11U24, which is already 20% of the available memory. To make matters worse, fclose() has a known memory leak on microlib that hasn't been fixed for almost a year now. Digging into the built-in mbed filesystem API reveals that FATFileSystem's open() method should require less than 700B of stack, and microlib only uses 20B of heap per FILE object... So where is the other 900B going?

Next up, read/write performance. Throughput may be fine on the LPC1768 (which uses the standard toolchain), but it absolutely sucks on everything else. To prove this, I wrote a test application that accesses files using three different APIs: the stdio API, the built-in mbed filesystem API, and the raw FatFs API:

main.cpp

#include "mbed.h"
#include "SDFileSystem.h"

//Shared timer: each test starts it around the transfer loop, reads it, then resets it
Timer timer;
//SD card file system object; the "sd" argument matches the "/sd/" prefix used by the stdio test
//NOTE(review): the pin arguments and 20000000 are presumably the SPI/card-detect pins and the
//SPI clock in Hz - confirm against SDFileSystem.h
SDFileSystem sd(p5, p6, p7, p20, "sd", p22, SDFileSystem::SWITCH_NO, 20000000);
//Transfer buffer shared by all tests; its size is the block size of every read/write call.
//It is never filled explicitly, so as a global it holds its zero-initialized contents until a read test overwrites it.
char buffer[1024];

//Benchmarks sequential file access through the C stdio API (fopen/fread/fwrite).
//
//Read test: reads "/sd/Test File.bin" in sizeof(buffer) chunks until a short read and
//reports the throughput from the number of bytes actually read.
//Write test: writes 1MB to "/sd/Write Test.bin" in sizeof(buffer) chunks; a short write
//calls error(). Results are printed in KB/s using the shared global timer.
void testStdio()
{
    FILE* file;

    //Test read performance using a 10MB file
    //sizeof yields size_t, so convert explicitly to match the %i conversion
    printf("\nTesting stdio %iB read performance...", static_cast<int>(sizeof(buffer)));
    file = fopen("/sd/Test File.bin", "r");
    if (file != NULL) {
        unsigned long totalBytes = 0;
        size_t count;

        timer.start();
        do {
            count = fread(buffer, sizeof(char), sizeof(buffer), file);
            //Count what was really transferred so a short or missing file can't inflate the result
            totalBytes += count;
        } while (count == sizeof(buffer));
        timer.stop();
        if (fclose(file) == 0)
            printf("done!\n\tResult: %.2fKB/s\n", (totalBytes / 1024.0) / (timer.read_us() / 1000000.0));
        else
            printf("failed to close file!\n");
        timer.reset();
    } else {
        printf("failed to open file!\n");
    }

    //Test write performance by creating a 1MB file
    printf("Testing stdio %iB write performance...", static_cast<int>(sizeof(buffer)));
    file = fopen("/sd/Write Test.bin", "w");
    if (file != NULL) {
        //Number of buffer-sized chunks that make up 1MB
        const size_t chunkCount = 1048576 / sizeof(buffer);

        timer.start();
        for (size_t i = 0; i < chunkCount; i++) {
            if (fwrite(buffer, sizeof(char), sizeof(buffer), file) != sizeof(buffer)) {
                error("write error!\n");
            }
        }
        timer.stop();
        //fclose() flushes any buffered data, so a failure here means the file is incomplete
        if (fclose(file) == 0)
            printf("done!\n\tResult: %.2fKB/s\n", 1024 / (timer.read_us() / 1000000.0));
        else
            printf("failed to close file!\n");
        timer.reset();
    } else {
        printf("failed to open file!\n");
    }
}

//Benchmarks sequential file access through the mbed filesystem API (FileHandle).
//
//Read test: reads "Test File.bin" in sizeof(buffer) chunks until a short read or error
//and reports the throughput from the number of bytes actually read.
//Write test: writes 1MB to "Write Test.bin" in sizeof(buffer) chunks; a short write
//calls error(). Results are printed in KB/s using the shared global timer.
void testMbed()
{
    FileHandle* file;
    //Chunk size as a signed value, so comparisons with the signed read/write results are well-defined
    const long chunkSize = static_cast<long>(sizeof(buffer));

    //Test read performance using a 10MB file
    //sizeof yields size_t, so convert explicitly to match the %i conversion
    printf("\nTesting mbed %iB read performance...", static_cast<int>(sizeof(buffer)));
    file = sd.open("Test File.bin", O_RDONLY);
    if (file != NULL) {
        unsigned long totalBytes = 0;
        long count;

        timer.start();
        do {
            count = file->read(buffer, sizeof(buffer));
            //Only positive results are transferred bytes; anything else ends the loop below
            if (count > 0)
                totalBytes += count;
        } while (count == chunkSize);
        timer.stop();
        if (file->close() == 0)
            printf("done!\n\tResult: %.2fKB/s\n", (totalBytes / 1024.0) / (timer.read_us() / 1000000.0));
        else
            printf("failed to close file!\n");
        timer.reset();
    } else {
        printf("failed to open file!\n");
    }

    //Test write performance by creating a 1MB file
    printf("Testing mbed %iB write performance...", static_cast<int>(sizeof(buffer)));
    file = sd.open("Write Test.bin", O_WRONLY | O_CREAT | O_TRUNC);
    if (file != NULL) {
        //Number of buffer-sized chunks that make up 1MB
        const unsigned int chunkCount = 1048576 / sizeof(buffer);

        timer.start();
        for (unsigned int i = 0; i < chunkCount; i++) {
            if (file->write(buffer, sizeof(buffer)) != chunkSize) {
                error("write error!\n");
            }
        }
        timer.stop();
        if (file->close() == 0)
            printf("done!\n\tResult: %.2fKB/s\n", 1024 / (timer.read_us() / 1000000.0));
        else
            printf("failed to close file!\n");
        timer.reset();
    } else {
        printf("failed to open file!\n");
    }
}

//Benchmarks sequential file access through the raw FatFs API (f_open/f_read/f_write).
//
//Read test: reads "0:Test File.bin" in sizeof(buffer) chunks until a short read or an
//error result and reports the throughput from the number of bytes actually read.
//Write test: writes up to 1MB to "0:Write Test.bin" in sizeof(buffer) chunks. An error
//result calls error(); a short write stops the loop and the throughput is computed from
//the bytes actually written. Results are printed in KB/s using the shared global timer.
void testFatFs()
{
    FIL file;
    FRESULT res;
    unsigned int bytes;
    unsigned long totalBytes;

    //Test read performance using a 10MB file
    //sizeof yields size_t, so convert explicitly to match the %i conversion
    printf("\nTesting FatFs %iB read performance...", static_cast<int>(sizeof(buffer)));
    res = f_open(&file, "0:Test File.bin", FA_READ | FA_OPEN_EXISTING);
    if (res == FR_OK) {
        totalBytes = 0;
        timer.start();
        do {
            res = f_read(&file, buffer, sizeof(buffer), &bytes);
            //Only trust the byte count when the call itself succeeded
            if (res == FR_OK)
                totalBytes += bytes;
        } while (res == FR_OK && bytes == sizeof(buffer));
        timer.stop();
        if (f_close(&file) == FR_OK)
            printf("done!\n\tResult: %.2fKB/s\n", (totalBytes / 1024.0) / (timer.read_us() / 1000000.0));
        else
            printf("failed to close file!\n");
        timer.reset();
    } else {
        printf("failed to open file!\n");
    }

    //Test write performance by creating a 1MB file
    printf("Testing FatFs %iB write performance...", static_cast<int>(sizeof(buffer)));
    res = f_open(&file, "0:Write Test.bin", FA_WRITE | FA_CREATE_ALWAYS);
    if (res == FR_OK) {
        //Number of buffer-sized chunks that make up 1MB (derived from the buffer, like the other tests)
        const unsigned int chunkCount = 1048576 / sizeof(buffer);

        totalBytes = 0;
        timer.start();
        for (unsigned int i = 0; i < chunkCount; i++) {
            if (f_write(&file, buffer, sizeof(buffer), &bytes) != FR_OK) {
                error("write error!\n");
            } else {
                totalBytes += bytes;
                //A short write without an error result ends the test early
                //NOTE(review): the FatFs documentation describes this case as the volume being full - confirm
                if (bytes != sizeof(buffer))
                    break;
            }
        }
        timer.stop();
        if (f_close(&file) == FR_OK)
            printf("done!\n\tResult: %.2fKB/s\n", (totalBytes / 1024.0) / (timer.read_us() / 1000000.0));
        else
            printf("failed to close file!\n");
        timer.reset();
    } else {
        printf("failed to open file!\n");
    }
}

//Program entry point: configures the SD card driver, prints the card type, sector count
//and capacity, runs the stdio, mbed and FatFs benchmarks in that order, then spins forever.
int main()
{
    //Configure CRC and large frames
    sd.crc(true);
    sd.large_frames(true);

    //Print out the card info (to make sure it's initialized before testing)
    printf("Card type: ");
    //Query the card type once so every branch is decided on the same value
    switch (sd.card_type()) {
        case SDFileSystem::CARD_NONE:
            printf("None\n");
            break;
        case SDFileSystem::CARD_MMC:
            printf("MMC\n");
            break;
        case SDFileSystem::CARD_SD:
            printf("SD\n");
            break;
        case SDFileSystem::CARD_SDHC:
            printf("SDHC\n");
            break;
        default:
            printf("Unknown\n");
            break;
    }

    //Read the sector count once; unsigned long long matches %llu and keeps the
    //multiplication by 512 from wrapping on large cards
    const unsigned long long sectors = sd.disk_sectors();
    printf("Sectors: %llu\n", sectors);
    printf("Capacity: %.1fMB\n", (sectors * 512) / 1048576.0);

    //Test the performance of the three different APIs
    testStdio();
    testMbed();
    testFatFs();

    //Print the final message
    printf("\nTesting complete!\n");

    //Don't exit main!
    while(1);
}


And here are the results on an LPC11U24 using a Canon 32MB SDCv1 card:
/media/uploads/neilt6/32mb_benchmarks.jpg

...and a Patriot 8GB SDHCv2 card:
/media/uploads/neilt6/8gb_benchmarks.jpg

According to these results, the built-in mbed filesystem API adds virtually no overhead to the raw FatFs API, and yet the stdio API can only achieve 36% of the read throughput? Something is clearly wrong here!