C++ 对TXT 的串并行读写

任务说明：共有 36 篇文档，需要逐篇读入并统计词频（字典长度为 25），并比较串行与并行读写操作的耗时差距。

串行读入并统计词频

// LoadDocsInUbuntu.cpp

//

#include <iostream>

#include <stdio.h>

#include <vector>

using namespace std;

// Serially loads doc_1.txt .. doc_36.txt from ../data, counts how often each
// word id occurs in every document, and writes the per-document counts to
// ../result/freqUbuntuSerial_<d>.txt (one line of space-separated integers).
//
// Every token in an input file is read as an integer word id; ids are expected
// to lie in [1, kVocabSize]. Returns 0 on success and 1 if any file cannot be
// opened.
int main()
{
    const int kNumDocs = 36;    // documents are numbered 1..kNumDocs
    const int kVocabSize = 25;  // dictionary length: valid word ids are 1..kVocabSize

    char filename[100];
    std::vector< std::vector<int> > corpus;

    printf("load data ...\n");

    // The index is an int so that it matches the "%d" conversion below.
    for (int d = 1; d <= kNumDocs; d++) {
        sprintf(filename, "..//data/doc_%d.txt", d);
        FILE *in = fopen(filename, "r");
        if (in == NULL) {
            fprintf(stderr, "cannot open input file %s\n", filename);
            return 1;
        }

        std::vector<int> doc;
        int freq[kVocabSize] = { 0 };
        int word;

        // fscanf returns the number of converted items: testing for == 1 stops
        // on end-of-file AND on a malformed token (where it would return 0
        // forever without consuming input).
        while (fscanf(in, "%d", &word) == 1) {
            if (word < 1 || word > kVocabSize) {
                // An id outside the dictionary would index past the counter array.
                fprintf(stderr, "%s: word id %d outside [1, %d], skipped\n",
                        filename, word, kVocabSize);
                continue;
            }
            freq[word - 1]++;
            doc.push_back(word);
        }
        fclose(in);
        corpus.push_back(doc);

        sprintf(filename, "..//result/freqUbuntuSerial_%d.txt", d);
        FILE *out = fopen(filename, "w");
        if (out == NULL) {
            fprintf(stderr, "cannot open output file %s\n", filename);
            return 1;
        }
        for (int f = 0; f < kVocabSize; f++) {
            fprintf(out, "%d ", freq[f]);
        }
        fclose(out);
    }

    std::cout << "corpus.size()=" << corpus.size() << std::endl;
    return 0;
}

这里讨论并行有三种思路：一，按照文档序号进行分组读入统计等操作；二，在文档内按单词数目分组进行统计；三，将统计与读写操作并行处理。

针对第一种思路，使用openmp做多线程处理：

// LoadDocsByOpenMP.cpp

//

#include <omp.h>

#include <iostream>

#include <stdio.h>

#include <vector>

#include <stdlib.h>

#include <time.h>

#include <string>

using namespace std;

// Loads doc_1.txt .. doc_360.txt from ../data with 4 OpenMP threads, counts
// how often each word id occurs in every document, and writes the per-document
// counts to ../result/freqByOpenMP_<d>.txt (one line of space-separated
// integers). Prints the elapsed wall-clock time of the whole loop.
//
// Every token in an input file is read as an integer word id; ids are expected
// to lie in [1, kVocabSize]. Documents whose input or output file cannot be
// opened are skipped and reported on stderr. Returns 0 if every document was
// processed and 1 otherwise.
int main()
{
	const int kNumDocs = 360;   // documents are numbered 1..kNumDocs
	const int kVocabSize = 25;  // dictionary length: valid word ids are 1..kVocabSize

	int failed = 0;  // number of documents that could not be processed

	// Wall-clock timer: clock() reports processor time, which is not the
	// elapsed time once several threads run at once.
	const double start = omp_get_wtime();

	printf("load data ...\n");

	// Everything a thread touches while handling one document is declared
	// inside the loop body, so each iteration has its own copy. Sharing the
	// file-name buffers or the scanned word between threads would be a data race.
#pragma omp parallel for num_threads(4) reduction(+:failed)
	for (int d = 1; d <= kNumDocs; d++) {
		printf("Hello world, I am %d, docs index %d.\n", omp_get_thread_num(), d);

		char filename[100];
		sprintf(filename, "..//data/doc_%d.txt", d);
		FILE *in = fopen(filename, "r");
		if (in == NULL) {
			fprintf(stderr, "cannot open input file %s\n", filename);
			failed++;
			continue;  // leaving the loop early is not allowed inside a parallel for
		}

		int freq[kVocabSize] = { 0 };
		int word;

		// fscanf returns the number of converted items: testing for == 1 stops
		// on end-of-file AND on a malformed token (where it would return 0
		// forever without consuming input).
		while (fscanf(in, "%d", &word) == 1) {
			if (word < 1 || word > kVocabSize) {
				// An id outside the dictionary would index past the counter array.
				fprintf(stderr, "%s: word id %d outside [1, %d], skipped\n",
				        filename, word, kVocabSize);
				continue;
			}
			freq[word - 1]++;
		}
		fclose(in);

		char resultname[100];
		sprintf(resultname, "..//result/freqByOpenMP_%d.txt", d);
		FILE *out = fopen(resultname, "w");
		if (out == NULL) {
			fprintf(stderr, "cannot open output file %s\n", resultname);
			failed++;
			continue;
		}
		// Write the individual counters, not the address of the counter row.
		for (int f = 0; f < kVocabSize; f++) {
			fprintf(out, "%d ", freq[f]);
		}
		fclose(out);
	}

	const double finish = omp_get_wtime();

	std::cout << "time cost : " << (finish - start) << std::endl;

	return failed == 0 ? 0 : 1;
}

但初步比较发现，OpenMP 相对串行读取在速度上并没有太多提升；反而当线程数多于系统物理核数时，程序运行时间会加长。

另外两种实现思路在后续学习中继续实现。

巴特西

C++ 对TXT 的串并行读写

最新文章

热门文章