以下是您问题中代码的更新,我相信这将满足您的要求:
# uses python-docx package
import docx
import os
# uses pywin32 package
import win32com.client as win32
from win32com.client import constants
# Maps each Word file's original name to (character count / 65).
charCounts = {}
fileDir = '.'  # Put the path of the directory to be searched here

# Prefix of the temporary .docx copies produced from legacy .doc files.
# Files carrying it are skipped so leftovers are never counted twice.
_TEMP_PREFIX = 'TEMP_CONVERTED_WORD_FILE_'


def _convert_doc_to_docx(word, src_path, dest_path):
    """Save the .doc file at src_path as a .docx file at dest_path via Word.

    word is the Word.Application COM object; both paths must be absolute
    because Word does not resolve them against Python's working directory.
    """
    doc = word.Documents.Open(src_path)
    try:
        # Save through the document object itself instead of relying on
        # whichever document Word currently considers "active".
        doc.SaveAs(dest_path, FileFormat=constants.wdFormatXMLDocument)
    finally:
        # Always release the document (without saving changes), even when
        # the conversion failed.
        doc.Close(False)


def _count_chars(docx_path):
    """Return the total paragraph text length of the .docx at docx_path.

    Counts body paragraphs plus the header and footer paragraphs of every
    section.
    """
    document = docx.Document(docx_path)
    body_chars = sum(len(p.text) for p in document.paragraphs)
    header_footer_chars = sum(
        len(p.text)
        for section in document.sections
        for part in (section.header, section.footer)
        for p in part.paragraphs
    )
    return body_chars + header_footer_chars


os.chdir(fileDir)
cwd = os.getcwd()

app = win32.gencache.EnsureDispatch('Word.Application')
try:
    for filename in os.listdir(cwd):
        if filename.startswith(_TEMP_PREFIX):
            continue
        # Results are always keyed by the name of the file as found on disk.
        originalName = filename
        if filename.endswith('.doc'):
            # python-docx cannot read .doc, so convert to a temporary .docx.
            convertedName = _TEMP_PREFIX + filename[:-4] + '.docx'
            _convert_doc_to_docx(
                app,
                os.path.normpath(os.path.join(cwd, filename)),
                os.path.normpath(os.path.join(cwd, convertedName)),
            )
            filename = convertedName
        if filename.endswith('.docx'):
            docxPath = os.path.normpath(os.path.join(cwd, filename))
            charCounts[originalName] = _count_chars(docxPath) / 65
finally:
    # Shut the automation instance down even if a file failed, so it is not
    # left running after the script ends.
    app.Quit()

# Rebuild the dict so that iteration follows sorted file-name order.
charCounts = dict(sorted(charCounts.items()))
# uses openpyxl package
from openpyxl import Workbook

# Sheet layout: file names in column B, chars/65 in column D, headers in
# row 1, data from row 3 onward, and a 'Total' row directly below the data.
wb = Workbook()
ws = wb.active
ws.cell(row=1, column=2, value='File Name')
ws.cell(row=1, column=4, value='chars/65')

rowNum = 3
for name, count in charCounts.items():
    # Drop the extension: 4 characters for '.doc', otherwise 5 ('.docx').
    suffixLen = 4 if name.endswith('.doc') else 5
    ws.cell(row=rowNum, column=2, value=name[:-suffixLen])
    ws.cell(row=rowNum, column=4, value=count)
    rowNum += 1

# rowNum now points at the first row after the data.
ws.cell(row=rowNum, column=3, value='Total')
ws.cell(row=rowNum, column=4, value=sum(charCounts.values()))

path = './charCounts.xlsx'
wb.save(path)
说明:
- 对于名称以 `.docx` 结尾的每个文件(以 `TEMP_CONVERTED_WORD_FILE_` 开头的文件除外),将字符数(除以65)以文件名作为键存储在字典 `charCounts` 中
- 对于每个以 `.doc` 结尾的文件,使用 Win32 extensions 的 `pywin32` 包将其转换为 `.docx` 文件,文件名前加 `TEMP_CONVERTED_WORD_FILE_`,然后将字符数(除以65)以其原始文件名作为键存储在上述同一字典中
- 将 `charCounts` 字典替换为一个按文件名键排序的新字典(字典会保持插入顺序,因此之后的遍历按文件名顺序进行)
- 遍历 `charCounts`,将内容存储在Excel文件中,注意截去文件名键的 `.doc` 或 `.docx` 后缀。