我运行了Check50,我皱起了眉头,因为"拼写判断不区分大小写","正确地处理了大多数基本的单词".

我怎么才能解决这个问题呢?这和我的散列函数有关吗?

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "dictionary.h"

// Represents a node in a hash table
typedef struct node
{
    char word[LENGTH + 1];
    struct node *next;
}
node;

// Number of words in dictionary
int word_count = 0;

// Number of buckets in hash table
const unsigned int N = 26;

// Hash table
node *table[N];

// Returns true if word is in dictionary, else false
bool check(const char *word)
{
    unsigned int n = hash(word);

    node *cursor = table[n];

    while (cursor != NULL)
    {
        if (strcasecmp(word, cursor->word) == 0)
        {
            return true;
        }

        cursor = cursor->next;
    }
    return false;
}

// Hashes word to a number
// Function credit to staff on CS50 reddit page
unsigned int hash(const char *word)
{
    unsigned int hash_value = 0;

    for (int i = 0, n = strlen(word); i < n; i++)
    {
         hash_value = (hash_value << 2) ^ word[i];
    }
    return hash_value % N; 
}

// Loads dictionary into memory, returning true if successful else false
bool load(const char *dictionary)
{
    // Open dictionary and check for memory issue
    //Function guide credit to CS50 Guide by Anvea on YouTube
    FILE *dict = fopen(dictionary, "r");
    char word[LENGTH + 1];

    // Check for memory issue with dict
    if (dict == NULL)
    {
        printf("Dictionary is null\n");
        unload();
        return false;
    }

    while (fscanf(dict, "%s", word) != EOF)
    {
        node *n = malloc(sizeof(node));
        if (n == NULL)
        {
            return false;
        }

        strcpy(n->word, word);
        word_count++;

        // Index word using hash function
        int dict_index = hash(word);

        // Insert into hash table if already empty
        if (table[dict_index] == NULL)
        {
            n->next = NULL;
        }
        // Insert work as new node if not empyty
        else
        {
            n->next = table[dict_index];
        }

        table[dict_index] = n;
    }

    // Close dictionary file
    fclose(dict);

    // Indicate success
    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int size(void)
{
    return word_count;
    return 0;
}

// Unloads dictionary from memory, returning true if successful else false
bool unload(void)
{
    for (int i = 0; i < N; i++)
    {
        node *cursor = table[i];

        while (cursor)
        {
            node *tmp = cursor;
            cursor = cursor->next;
            free(tmp);
        }
    }
    return true;
}

推荐答案

有几个问题.

  1. 所有的字典单词都是小写的,每行一个.所以,哈希是在10个字符上完成的.
  2. 但是,支票文本可以是大写的.在check中,它对大小写混合的文本调用hash,因此它可以为某些单词(例如Won't this work?No, it won't.)生成不同的散列值
  3. check中,我们需要对调用hash的字符串before进行重命名,这样我们就得到了same的哈希值/索引.作为一个有益的副作用,我们可以将strcasecmp替换为strcmp(速度更快).
  4. load中,有no需要有一个特殊的情况,当table[n]NULL时,所以我们可以简化它.
  5. 虽然我没有判断这一点,但在hash中执行<<可能会产生一个不均匀分布的散列值,从而减慢在check中的搜索速度.
  6. 因为字典文件[保证]全是小写,并且每行有一个条目,所以在load中,我们可以用fgets[go 掉换行符]替换fscanf,因为fgets要快得多.
  7. gcc 8.3.1中,编译器标记为node *table[N];,因为:const unsigned int N = 26;与C++不同,这在C中是不允许的--它需要文字常量,而not只需要const.因此,我将其更改为enum { N = 26 };
  8. Style note:永远不会:x -> y而不是x->y.

以下是更正后的代码.我已经对照了所有可能的输入文件和词典进行了判断.它带有错误和修复的注释:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#if 1
#include <ctype.h>
#endif

#include "dictionary.h"

// Represents a node in a hash table
typedef struct node {
    char word[LENGTH + 1];
    struct node *next;
} node;

// Number of words in dictionary
int word_count = 0;

// Number of buckets in hash table
// NOTE/BUG: C (vs C++) doesn't support global arrays from const
#if 0
const unsigned int N = 26;
#else
enum {
    N = 26
};
#endif

// Hash table
node *table[N];

// locase -- copy lower cased word
void
locase(char *dst,const char *src)
{

    for (;  *src != 0;  ++src, ++dst)
        *dst = tolower((unsigned char) *src);
    *dst = 0;
}

// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
// NOTE/FIX: we _must_ take lowercase of word _before_ calculating the hash
// otherwise, the hash value will differ for uppercase words here and lowercase
// words in the dictionary (i.e. different hash buckets)
// NOTE/FIX: we should convert test word to lowercase _once_ (it allows us to
// use strcmp below)
#if 1
    char lobuf[LENGTH + 1];
    locase(lobuf,word);
    word = lobuf;
#endif
    unsigned int n = hash(word);

    node *cursor = table[n];

    while (cursor != NULL) {
// NOTE/BUG: strcasecmp is very slow compared to strcmp
#if 0
        if (strcasecmp(word, cursor->word) == 0)
            return true;
#else
        if (strcmp(word, cursor->word) == 0)
            return true;
#endif
        cursor = cursor->next;
    }

    return false;
}

// Hashes word to a number
// Function credit to staff on CS50 reddit page
unsigned int
hash(const char *word)
{
    unsigned int hash_value = 0;

#if 0
    for (int i = 0, n = strlen(word); i < n; i++)
        hash_value = (hash_value << 2) ^ word[i];
#endif
#if 0
    for (int i = 0; word[i] != 0; i++)
        hash_value = (hash_value << 2) ^ word[i];
#endif
#if 1
    for (int i = 0; word[i] != 0; i++)
        hash_value = (hash_value * 31) ^ word[i];
#endif

    return hash_value % N;
}

// Loads dictionary into memory, returning true if successful else false
bool
load(const char *dictionary)
{
    // Open dictionary and check for memory issue
    // Function guide credit to CS50 Guide by Anvea on YouTube
    FILE *dict = fopen(dictionary, "r");
#if 0
    char word[LENGTH + 1];
#else
    char word[LENGTH + 10];
#endif

    // Check for memory issue with dict
    if (dict == NULL) {
        printf("Dictionary is null\n");
        unload();
        return false;
    }

// NOTE/BUG: dictionary is guaranteed to be one word per line and fscanf is
// slow compared to fgets
#if 0
    while (fscanf(dict, "%s", word) != EOF) {
#else
    while (1) {
        if (fgets(word,sizeof(word),dict) == NULL)
            break;
        char *cp = strchr(word,'\n');
        if (cp != NULL)
            *cp = 0;
#endif

        node *n = malloc(sizeof(node));

        if (n == NULL) {
            return false;
        }

        strcpy(n->word, word);

        word_count++;

        // Index word using hash function
        int dict_index = hash(word);

// NOTE/BUG: no need to special case this
#if 0
        // Insert into hash table if already empty
        if (table[dict_index] == NULL) {
            n->next = NULL;
        }
        // Insert work as new node if not empyty
        else {
            n->next = table[dict_index];
        }
#else
        n->next = table[dict_index];
#endif

        table[dict_index] = n;
    }

    // Close dictionary file
    fclose(dict);

    // Indicate success
    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int
size(void)
{
    return word_count;
    return 0;
}

// Unloads dictionary from memory, returning true if successful else false
bool
unload(void)
{
    for (int i = 0; i < N; i++) {

        node *cursor = table[i];

        while (cursor) {
            node *tmp = cursor;

            cursor = cursor->next;
            free(tmp);
        }
    }
    return true;
}

在上面的代码中,我使用了cpp个条件来表示旧代码和新代码:

#if 0
// old code
#else
// new code
#endif

#if 1
// new code
#endif

注意:这可以通过运行文件到unifdef -k来清理


以下是经过清理的版本(遍历unifdef -k,并进行一些 comments 清理):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>

#include "dictionary.h"

// Represents a node in a hash table
typedef struct node {
    char word[LENGTH + 1];
    struct node *next;
} node;

// Number of words in dictionary
int word_count = 0;

// Number of buckets in hash table
enum {
    N = 26
};

// Hash table
node *table[N];

// locase -- copy lower cased word
void
locase(char *dst,const char *src)
{

    for (;  *src != 0;  ++src, ++dst)
        *dst = tolower((unsigned char) *src);
    *dst = 0;
}

// Returns true if word is in dictionary, else false
bool
check(const char *word)
{
    char lobuf[LENGTH + 1];
    locase(lobuf,word);
    word = lobuf;
    unsigned int n = hash(word);

    node *cursor = table[n];

    while (cursor != NULL) {
        if (strcmp(word, cursor->word) == 0)
            return true;
        cursor = cursor->next;
    }

    return false;
}

// Hashes word to a number
// Function credit to staff on CS50 reddit page
unsigned int
hash(const char *word)
{
    unsigned int hash_value = 0;

    for (int i = 0; word[i] != 0; i++)
        hash_value = (hash_value * 31) ^ word[i];

    return hash_value % N;
}

// Loads dictionary into memory, returning true if successful else false
bool
load(const char *dictionary)
{
    // Open dictionary and check for memory issue
    // Function guide credit to CS50 Guide by Anvea on YouTube
    FILE *dict = fopen(dictionary, "r");
    char word[LENGTH + 10];

    // Check for memory issue with dict
    if (dict == NULL) {
        printf("Dictionary is null\n");
        unload();
        return false;
    }

    while (1) {
        if (fgets(word,sizeof(word),dict) == NULL)
            break;
        char *cp = strchr(word,'\n');
        if (cp != NULL)
            *cp = 0;

        node *n = malloc(sizeof(node));

        if (n == NULL) {
            return false;
        }

        strcpy(n->word, word);

        word_count++;

        // Index word using hash function
        int dict_index = hash(word);

        n->next = table[dict_index];

        table[dict_index] = n;
    }

    // Close dictionary file
    fclose(dict);

    // Indicate success
    return true;
}

// Returns number of words in dictionary if loaded else 0 if not yet loaded
unsigned int
size(void)
{
    return word_count;
    return 0;
}

// Unloads dictionary from memory, returning true if successful else false
bool
unload(void)
{
    for (int i = 0; i < N; i++) {

        node *cursor = table[i];

        while (cursor) {
            node *tmp = cursor;

            cursor = cursor->next;
            free(tmp);
        }
    }
    return true;
}

虽然上面的代码[现在]是正确的,但它还可以进一步改进.

有关其他修复和加速的信息,请参阅我的CS50拼写测试答案:cs50 pset5 Speller optimisation

以下是您的[FIXED]版本与我的[OPTIMIZED]版本之间check函数运行时间的比较:

 Yours       Mine     File
 27.900433   0.271936 aca.txt
  8.713822   0.093062 austen.txt
  1.553954   0.015378 birdman.txt
  3.996230   0.041159 burnett.txt
  1.927505   0.021139 carroll.txt
  0.001437   0.000007 cat.txt
  0.477209   0.005412 constitution.txt
 12.684350   0.141040 federalist.txt
  5.947873   0.060730 frankenstein.txt
  6.810451   0.081574 grimm.txt
  1.279325   0.013508 her.txt
 78.311622   0.861220 holmes.txt
 14.265868   0.145593 homer.txt
  1.297946   0.013600 lalaland.txt
  4.110416   0.042714 mansfield.txt
  0.002796   0.000017 pneumonoultramicroscopicsilicovolcanoconiosis.txt
  1.739467   0.017283 revenant.txt
  5.238748   0.054731 rinehart.txt
 68.048034   0.680697 shakespeare.txt
  6.071052   0.064508 stein.txt
 11.721317   0.121086 stoker.txt
 14.137166   0.146902 surgery.txt
 40.615153   0.433262 tolstoy.txt
  9.731160   0.099112 wells.txt
 39.266734   0.457958 whittier.txt
  0.014416   0.000132 wordsworth.txt
 13.774792   0.144299 xueqin1.txt
 18.768864   0.196494 xueqin2.txt

C++相关问答推荐

理解C中的指针定义

如何判断宏参数是否为C语言中的整型文字

如何使用Python C API实现多线程程序?

将 typewriter LF打印到Windows终端,而不是隐含的CR+LF

在没有动态内存分配的情况下,用C语言最快地将各种数组复制到单个较大的数组中

如何将长字符串转换为较小的缩写,该缩写由第一个字符、最后一个字符和中间的字符数组成?

C语言编译阶段与翻译阶段的关系

Setenv在c编程中的用法?

GCC不顾-fno-Builtin-SINCOS旗帜向SINCOS发出呼唤

用gcc-msse 2编译的C程序包含AVX 1指令

处理来自浏览器的HTTP请求

预处理器宏扩展(ISO/IEC 9899:1999(E)§;6.10.3.5示例3)

Tic-tac-toe:从文件加载存储

有没有一种方法可以用C创建保留限定符的函数?

从BIOS(8086)中读取刻度需要多少?

使用正则表达式获取字符串中标记的开始和结束

为什么二进制文件的大小不会随着静态数据的大小而增加?

如何在不更改格式说明符的情况下同时支持双精度和长双精度?

Makefile无法将代码刷新到ATmega328p

我可以使用Windows SDK';s IN6_IS_ADDR_LOOPBACK等,尽管没有文档?