Как рекурсивно перечислить каталоги в C на Linux?

Мне нужно рекурсивно перечислить все каталоги и файлы в C-программировании. Я посмотрел в FTW, но это не входит в 2 операционные системы, которые я использую (Fedora и Minix). Я начинаю получать большую головную боль из всех вещей, которые я прочитал за последние несколько часов.

Если кто-то знает фрагмент кода, я мог бы посмотреть, что это было бы потрясающе, или если кто-то может дать мне хорошее направление, я был бы очень благодарен.

Ответ 1

Вот рекурсивная версия:

#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>

void listdir(const char *name, int indent)
{
    DIR *dir;
    struct dirent *entry;

    if (!(dir = opendir(name)))
        return;

    while ((entry = readdir(dir)) != NULL) {
        if (entry->d_type == DT_DIR) {
            char path[1024];
            if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
                continue;
            snprintf(path, sizeof(path), "%s/%s", name, entry->d_name);
            printf("%*s[%s]\n", indent, "", entry->d_name);
            listdir(path, indent + 2);
        } else {
            printf("%*s- %s\n", indent, "", entry->d_name);
        }
    }
    closedir(dir);
}

int main(void) {
    listdir(".", 0);
    return 0;
}

Ответ 2

Почему все настаивают на повторном создании колеса снова и снова?

POSIX.1-2008 стандартизовал функцию nftw(), также определенную в спецификации Single Unix Specification v4 (SuSv4) и доступную в Linux (glibc, man 3 nftw), OS X и большинство существующих вариантов BSD. Это не ново.

Naïve opendir()/readdir()/closedir() основанные реализации почти никогда не обрабатывают случаи, когда каталоги или файлы перемещаются, переименовываются или удаляются во время обхода дерева, тогда как nftw() должно обрабатывать их изящно.

В качестве примера рассмотрим следующую программу на языке C, которая отображает дерево каталогов, начиная с текущего рабочего каталога, или в каждом из каталогов, названных в командной строке, или просто файлы, названные в командной строке:

/* We want POSIX.1-2008 + XSI, i.e. SuSv4, features */
#define _XOPEN_SOURCE 700

/* Added on 2017-06-25:
   If the C library can support 64-bit file sizes
   and offsets, using the standard names,
   these defines tell the C library to do so. */
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64 

#include <stdlib.h>
#include <unistd.h>
#include <ftw.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* POSIX.1 says each process has at least 20 file descriptors.
 * Three of those belong to the standard streams.
 * Here, we use a conservative estimate of 15 available;
 * assuming we use at most two for other uses in this program,
 * we should never run into any problems.
 * Most trees are shallower than that, so it is efficient.
 * Deeper trees are traversed fine, just a bit slower.
 * (Linux allows typically hundreds to thousands of open files,
 *  so you'll probably never see any issues even if you used
 *  a much higher value, say a couple of hundred, but
 *  15 is a safe, reasonable value.)
*/
#ifndef USE_FDS
#define USE_FDS 15
#endif

int print_entry(const char *filepath, const struct stat *info,
                const int typeflag, struct FTW *pathinfo)
{
    /* const char *const filename = filepath + pathinfo->base; */
    const double bytes = (double)info->st_size; /* Not exact if large! */
    struct tm mtime;

    localtime_r(&(info->st_mtime), &mtime);

    printf("%04d-%02d-%02d %02d:%02d:%02d",
           mtime.tm_year+1900, mtime.tm_mon+1, mtime.tm_mday,
           mtime.tm_hour, mtime.tm_min, mtime.tm_sec);

    if (bytes >= 1099511627776.0)
        printf(" %9.3f TiB", bytes / 1099511627776.0);
    else
    if (bytes >= 1073741824.0)
        printf(" %9.3f GiB", bytes / 1073741824.0);
    else
    if (bytes >= 1048576.0)
        printf(" %9.3f MiB", bytes / 1048576.0);
    else
    if (bytes >= 1024.0)
        printf(" %9.3f KiB", bytes / 1024.0);
    else
        printf(" %9.0f B  ", bytes);

    if (typeflag == FTW_SL) {
        char   *target;
        size_t  maxlen = 1023;
        ssize_t len;

        while (1) {

            target = malloc(maxlen + 1);
            if (target == NULL)
                return ENOMEM;

            len = readlink(filepath, target, maxlen);
            if (len == (ssize_t)-1) {
                const int saved_errno = errno;
                free(target);
                return saved_errno;
            }
            if (len >= (ssize_t)maxlen) {
                free(target);
                maxlen += 1024;
                continue;
            }

            target[len] = '\0';
            break;
        }

        printf(" %s -> %s\n", filepath, target);
        free(target);

    } else
    if (typeflag == FTW_SLN)
        printf(" %s (dangling symlink)\n", filepath);
    else
    if (typeflag == FTW_F)
        printf(" %s\n", filepath);
    else
    if (typeflag == FTW_D || typeflag == FTW_DP)
        printf(" %s/\n", filepath);
    else
    if (typeflag == FTW_DNR)
        printf(" %s/ (unreadable)\n", filepath);
    else
        printf(" %s (unknown)\n", filepath);

    return 0;
}


int print_directory_tree(const char *const dirpath)
{
    int result;

    /* Invalid directory path? */
    if (dirpath == NULL || *dirpath == '\0')
        return errno = EINVAL;

    result = nftw(dirpath, print_entry, USE_FDS, FTW_PHYS);
    if (result >= 0)
        errno = result;

    return errno;
}

int main(int argc, char *argv[])
{
    int arg;

    if (argc < 2) {

        if (print_directory_tree(".")) {
            fprintf(stderr, "%s.\n", strerror(errno));
            return EXIT_FAILURE;
        }

    } else {

        for (arg = 1; arg < argc; arg++) {
            if (print_directory_tree(argv[arg])) {
                fprintf(stderr, "%s.\n", strerror(errno));
                return EXIT_FAILURE;
            }
        }

    }

    return EXIT_SUCCESS;
}

Большая часть приведенного выше кода находится в print_entry(). Его задача - распечатать каждую запись в каталоге. В print_directory_tree() мы говорим nftw(), чтобы вызвать его для каждой записи в каталоге, которую он видит.

Единственная волнистая деталь, описанная выше, - это решение о том, сколько дескрипторов файлов следует использовать nftw(). Если ваша программа использует не более двух дополнительных файловых дескрипторов (в дополнение к стандартным потокам) во время ходьбы файлового дерева, 15, как известно, безопасна (во всех системах с nftw() и в основном совместима с POSIX).

В Linux вы можете использовать sysconf(_SC_OPEN_MAX), чтобы найти максимальное количество открытых файлов и вычесть число, которое вы используете одновременно с вызовом nftw(), но я бы не стал беспокоиться (если бы я не знал, что утилита будет использоваться в основном с патологически глубокими структурами каталогов). Пятнадцать дескрипторов не ограничивают глубину дерева; nftw() просто становится медленнее (и может не обнаруживать изменения в каталоге, если ходя по директории с более чем 13 каталогами из этого, хотя компромиссы и общая способность обнаруживать изменения различаются между системами и реализациями библиотеки C). Просто используя константу времени компиляции, поддерживающую переносимый код, она должна работать не только на Linux, но и на Mac OS X, а также на всех существующих вариантах BSD и большинстве других не слишком старых Unix-вариантов.

В комментарии Руслан упомянул, что им пришлось переключиться на nftw64(), потому что у них были записи файловой системы, которые требовали 64-битных размеров/смещений, а "нормальная" версия nftw() не удалась с errno == EOVERFLOW. Правильное решение состоит в том, чтобы не переключать на GLIBC-специфичные 64-битные функции, а определять _LARGEFILE64_SOURCE и _FILE_OFFSET_BITS 64. Они сообщают библиотеке C переключиться на 64-битные размеры и смещения файлов, если это возможно, при использовании стандартных функций (nftw(), fstat() и т.д.) И имен типов (off_t и т.д.).

Ответ 3

int is_directory_we_want_to_list(const char *parent, char *name) {
  struct stat st_buf;
  if (!strcmp(".", name) || !strcmp("..", name))
    return 0;
  char *path = alloca(strlen(name) + strlen(parent) + 2);
  sprintf(path, "%s/%s", parent, name);
  stat(path, &st_buf);
  return S_ISDIR(st_buf.st_mode);
}

int list(const char *name) {
  DIR *dir = opendir(name);
  struct dirent *ent;
  while (ent = readdir(dir)) {
    char *entry_name = ent->d_name;
    printf("%s\n", entry_name);
    if (is_directory_we_want_to_list(name, entry_name)) {
      // You can consider using alloca instead.
      char *next = malloc(strlen(name) + strlen(entry_name) + 2);
      sprintf(next, "%s/%s", name, entry_name);
      list(next);
      free(next);
    }
  }
  closedir(dir);
}

Заголовки файлов, которые следует снять в этом контексте: stat.h, dirent.h. Имейте в виду, что приведенный выше код не проверяет наличие каких-либо ошибок.

По-разному предлагается ftw, определенная в ftw.h.

Ответ 4

Как я уже упоминал в своем комментарии, я считаю, что рекурсивный подход имеет два недостатка в этой задаче.

Первым недостатком является ограничение на открытые файлы. Этот предел накладывает ограничение на глубокий обход. Если вложенных папок достаточно, рекурсивный подход сломается. (См. редактирование)

Второй недостаток немного более тонкий. Рекурсивный подход очень трудно проверить на жесткие ссылки. Если дерево папок является циклическим (из-за жестких ссылок), рекурсивный подход сломается (надеюсь, без). (См. редактирование жестких ссылок)

Однако довольно просто избежать этих проблем, заменив рекурсию на один файловый дескриптор и связанные списки.

Я предполагаю, что это не школьный проект, и что рекурсия необязательна.

Вот пример приложения.

Используйте a.out ./ для просмотра дерева папок.

Я приношу свои извинения за макросы и прочее... Обычно я использую встроенные функции, но я думал, что будет легче следовать за кодом, если все это в одной функции.

#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

int main(int argc, char const *argv[]) {
  /* print use instruction unless a folder name was given */
  if (argc < 2)
    fprintf(stderr,
            "\nuse:\n"
            "    %s <directory>\n"
            "for example:\n"
            "    %s ./\n\n",
            argv[0], argv[0]),
        exit(0);

  /*************** a small linked list macro implementation ***************/

  typedef struct list_s {
    struct list_s *next;
    struct list_s *prev;
  } list_s;

#define LIST_INIT(name)                                                        \
  { .next = &name, .prev = &name }

#define LIST_PUSH(dest, node)                                                  \
  do {                                                                         \
    (node)->next = (dest)->next;                                               \
    (node)->prev = (dest);                                                     \
    (node)->next->prev = (node);                                               \
    (dest)->next = (node);                                                     \
  } while (0);

#define LIST_POP(list, var)                                                    \
  if ((list)->next == (list)) {                                                \
    var = NULL;                                                                \
  } else {                                                                     \
    var = (list)->next;                                                        \
    (list)->next = var->next;                                                  \
    var->next->prev = var->prev;                                               \
  }

  /*************** a record (file / folder) item type ***************/

  typedef struct record_s {
    /* this is a flat processing queue. */
    list_s queue;
    /* this will list all queued and processed folders (cyclic protection) */
    list_s folders;
    /* this will list all the completed items (siblings and such) */
    list_s list;
    /* unique ID */
    ino_t ino;
    /* name length */
    size_t len;
    /* name string */
    char name[];
  } record_s;

/* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name)                                           \
  ((record_s *)(((uintptr_t)(node)) -                                          \
                ((uintptr_t) & ((record_s *)0)->list_name)))

/* initializes a new record */
#define RECORD_INIT(name)                                                      \
  (record_s){.queue = LIST_INIT((name).queue),                                 \
             .folders = LIST_INIT((name).folders),                             \
             .list = LIST_INIT((name).list)}

  /*************** the actual code ***************/

  record_s records = RECORD_INIT(records);
  record_s *pos, *item;
  list_s *tmp;
  DIR *dir;
  struct dirent *entry;

  /* initialize the root folder record and add it to the queue */
  pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
  *pos = RECORD_INIT(*pos);
  pos->len = strlen(argv[1]);
  memcpy(pos->name, argv[1], pos->len);
  if (pos->name[pos->len - 1] != '/')
    pos->name[pos->len++] = '/';
  pos->name[pos->len] = 0;
  /* push to queue, but also push to list (first item processed) */
  LIST_PUSH(&records.queue, &pos->queue);
  LIST_PUSH(&records.list, &pos->list);

  /* as long as the queue has items to be processed, do so */
  while (records.queue.next != &records.queue) {
    /* pop queued item */
    LIST_POP(&records.queue, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, queue);
    /* add record to the processed folder list */
    LIST_PUSH(&records.folders, &pos->folders);

    /* process the folder and add all folder data to current list */
    dir = opendir(pos->name);
    if (!dir)
      continue;

    while ((entry = readdir(dir)) != NULL) {

      /* create new item, copying it path data and unique ID */
      item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2);
      *item = RECORD_INIT(*item);
      item->len = pos->len + entry->d_namlen;
      memcpy(item->name, pos->name, pos->len);
      memcpy(item->name + pos->len, entry->d_name, entry->d_namlen);
      item->name[item->len] = 0;
      item->ino = entry->d_ino;
      /* add item to the list, right after the `pos` item */
      LIST_PUSH(&pos->list, &item->list);

      /* unless it a folder, we're done. */
      if (entry->d_type != DT_DIR)
        continue;

      /* test for '.' and '..' */
      if (entry->d_name[0] == '.' &&
          (entry->d_name[1] == 0 ||
           (entry->d_name[1] == '.' && entry->d_name[2] == 0)))
        continue;

      /* add folder marker */
      item->name[item->len++] = '/';
      item->name[item->len] = 0;

      /* test for cyclic processing */
      list_s *t = records.folders.next;
      while (t != &records.folders) {
        if (NODE2RECORD(t, folders)->ino == item->ino) {
          /* we already processed this folder! */
          break; /* this breaks from the small loop... */
        }
        t = t->next;
      }
      if (t != &records.folders)
        continue; /* if we broke from the small loop, entry is done */

      /* item is a new folder, add to queue */
      LIST_PUSH(&records.queue, &item->queue);
    }
    closedir(dir);
  }

  /*************** Printing the results and cleaning up ***************/
  while (records.list.next != &records.list) {
    /* pop list item */
    LIST_POP(&records.list, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, list);
    /* prepare for next iteration */
    LIST_POP(&records.list, tmp);
    fwrite(pos->name, pos->len, 1, stderr);
    fwrite("\n", 1, 1, stderr);
    free(pos);
  }
  return 0;
}

ИЗМЕНИТЬ

@Stargateur упоминает в комментариях, что рекурсивный код, вероятно, переполнит стек до достижения предела открытого файла.

Хотя я не вижу, как переполнение стека лучше, эта оценка, вероятно, правильная, если процесс не близок к пределу файла при вызове.

Еще один момент, упомянутый @Stargateur в комментариях, заключался в том, что глубина рекурсивного кода ограничена максимальным количеством подкаталогов (64000 в файловой системе ext4) и что жесткие ссылки крайне маловероятны (поскольку жесткие ссылки на папки не допускаются в Linux/Unix).

Это хорошая новость, если код работает в Linux (что, по мнению этого вопроса), так что эта проблема не является реальной проблемой (если только не выполняется код на macOS или, может быть, на Windows)... хотя 64K вложенных папок в рекурсии может сильно разбить стек.

Сказав это, ни один рекурсивный вариант не имеет преимуществ, таких как возможность легко добавлять ограничение на количество обрабатываемых элементов, а также возможность кэширования результата.

P.S.

В соответствии с комментариями здесь представлена нерекурсивная версия кода, которая не проверяет циклические иерархии. Это быстрее и должно быть достаточно безопасным для использования на машине Linux, где не разрешены жесткие ссылки на папки.

#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

int main(int argc, char const *argv[]) {
  /* print use instruction unless a folder name was given */
  if (argc < 2)
    fprintf(stderr,
            "\nuse:\n"
            "    %s <directory>\n"
            "for example:\n"
            "    %s ./\n\n",
            argv[0], argv[0]),
        exit(0);

  /*************** a small linked list macro implementation ***************/

  typedef struct list_s {
    struct list_s *next;
    struct list_s *prev;
  } list_s;

#define LIST_INIT(name)                                                        \
  { .next = &name, .prev = &name }

#define LIST_PUSH(dest, node)                                                  \
  do {                                                                         \
    (node)->next = (dest)->next;                                               \
    (node)->prev = (dest);                                                     \
    (node)->next->prev = (node);                                               \
    (dest)->next = (node);                                                     \
  } while (0);

#define LIST_POP(list, var)                                                    \
  if ((list)->next == (list)) {                                                \
    var = NULL;                                                                \
  } else {                                                                     \
    var = (list)->next;                                                        \
    (list)->next = var->next;                                                  \
    var->next->prev = var->prev;                                               \
  }

  /*************** a record (file / folder) item type ***************/

  typedef struct record_s {
    /* this is a flat processing queue. */
    list_s queue;
    /* this will list all the completed items (siblings and such) */
    list_s list;
    /* unique ID */
    ino_t ino;
    /* name length */
    size_t len;
    /* name string */
    char name[];
  } record_s;

/* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name)                                           \
  ((record_s *)(((uintptr_t)(node)) -                                          \
                ((uintptr_t) & ((record_s *)0)->list_name)))

/* initializes a new record */
#define RECORD_INIT(name)                                                      \
  (record_s){.queue = LIST_INIT((name).queue), .list = LIST_INIT((name).list)}

  /*************** the actual code ***************/

  record_s records = RECORD_INIT(records);
  record_s *pos, *item;
  list_s *tmp;
  DIR *dir;
  struct dirent *entry;

  /* initialize the root folder record and add it to the queue */
  pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
  *pos = RECORD_INIT(*pos);
  pos->len = strlen(argv[1]);
  memcpy(pos->name, argv[1], pos->len);
  if (pos->name[pos->len - 1] != '/')
    pos->name[pos->len++] = '/';
  pos->name[pos->len] = 0;
  /* push to queue, but also push to list (first item processed) */
  LIST_PUSH(&records.queue, &pos->queue);
  LIST_PUSH(&records.list, &pos->list);

  /* as long as the queue has items to be processed, do so */
  while (records.queue.next != &records.queue) {
    /* pop queued item */
    LIST_POP(&records.queue, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, queue);

    /* process the folder and add all folder data to current list */
    dir = opendir(pos->name);
    if (!dir)
      continue;

    while ((entry = readdir(dir)) != NULL) {

      /* create new item, copying it path data and unique ID */
      item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2);
      *item = RECORD_INIT(*item);
      item->len = pos->len + entry->d_namlen;
      memcpy(item->name, pos->name, pos->len);
      memcpy(item->name + pos->len, entry->d_name, entry->d_namlen);
      item->name[item->len] = 0;
      item->ino = entry->d_ino;
      /* add item to the list, right after the `pos` item */
      LIST_PUSH(&pos->list, &item->list);

      /* unless it a folder, we're done. */
      if (entry->d_type != DT_DIR)
        continue;

      /* test for '.' and '..' */
      if (entry->d_name[0] == '.' &&
          (entry->d_name[1] == 0 ||
           (entry->d_name[1] == '.' && entry->d_name[2] == 0)))
        continue;

      /* add folder marker */
      item->name[item->len++] = '/';
      item->name[item->len] = 0;

      /* item is a new folder, add to queue */
      LIST_PUSH(&records.queue, &item->queue);
    }
    closedir(dir);
  }

  /*************** Printing the results and cleaning up ***************/
  while (records.list.next != &records.list) {
    /* pop list item */
    LIST_POP(&records.list, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, list);
    /* prepare for next iteration */
    LIST_POP(&records.list, tmp);
    fwrite(pos->name, pos->len, 1, stderr);
    fwrite("\n", 1, 1, stderr);
    free(pos);
  }
  return 0;
}

Ответ 5

Вот упрощенная версия, которая является рекурсивной, но использует намного меньше стекового пространства:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>

void listdir(char *path, size_t size) {
    DIR *dir;
    struct dirent *entry;
    size_t len = strlen(path);

    if (!(dir = opendir(path))) {
        fprintf(stderr, "path not found: %s: %s\n",
                path, strerror(errno));
        return;
    }

    puts(path);
    while ((entry = readdir(dir)) != NULL) {
        char *name = entry->d_name;
        if (entry->d_type == DT_DIR) {
            if (!strcmp(name, ".") || !strcmp(name, ".."))
                continue;
            if (len + strlen(name) + 2 > size) {
                fprintf(stderr, "path too long: %s/%s\n", path, name);
            } else {
                path[len] = '/';
                strcpy(path + len + 1, name);
                listdir(path, size);
                path[len] = '\0';
            }
        } else {
            printf("%s/%s\n", path, name);
        }
    }
    closedir(dir);
}

int main(void) {
    char path[1024] = ".";
    listdir(path, sizeof path);
    return 0;
}

В моей системе его вывод точно совпадает с выводом find.