我有一个文件夹，里面还包含其他几个子文件夹，这些子文件夹中都存放着 TalkBank CHILDES XML 文件。
我已经编写了将数据提取到 Pandas 数据框中的代码。这段代码目前的工作方式是要求用户输入包含 XML 文件的文件夹目录，但我希望用户每次想要提取 XML 文件时，都不必重新输入目录。我可以这样编写吗？也就是说，我能否在代码中指定所有文件目录，并在运行 .py 文件后从所有文件夹中提取 XML？
import nltk
import os
import pandas as pd
from lxml import etree
from nltk.corpus.reader import CHILDESCorpusReader
# Function to get user input for the directory
def get_input_directory():
    """Prompt on standard input for the CHILDES corpus folder.

    Returns:
        The text typed by the user, exactly as returned by ``input``.
    """
    return input("Enter the directory containing CHILDES XML files: ")
# Namespace prefixes used by every XPath query below
namespaces = {
    'tb': 'http://www.talkbank.org/ns/talkbank',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
}

# Column layout of the two result frames; also applied when no rows are found,
# so the frames keep their columns even for an empty corpus.
SPEAKER_COLUMNS = ['dialogue_id', 'speaker_id', 'speaker_name', 'role', 'age', 'sex']
UTTERANCE_COLUMNS = ['dialogue_id', 'uID', 'speaker', 'utterance', 'utterance_length']


def _raise_walk_error(error):
    """Re-raise a directory-scan error that os.walk would otherwise ignore."""
    raise error


def _iter_xml_paths(root_dir):
    """Yield the path of every '.xml' file in root_dir and all folders below it.

    Raises:
        OSError: if root_dir (or a folder below it) cannot be listed, e.g.
            FileNotFoundError for a path that does not exist.
    """
    for current_dir, sub_dirs, filenames in os.walk(root_dir, onerror=_raise_walk_error):
        # Sorting in place fixes the order in which os.walk visits sub-folders
        sub_dirs.sort()
        for filename in sorted(filenames):
            if filename.endswith('.xml'):
                yield os.path.join(current_dir, filename)


def _dialogue_id(file_path):
    """Build the '<parent folder>/<file stem>.xml' identifier for one file."""
    folder_name = os.path.basename(os.path.dirname(file_path))
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    return f"{folder_name}/{file_stem}.xml"


def _speaker_rows(tree, dialogue_id):
    """Return one dict per tb:participant element under tb:Participants."""
    participants = tree.xpath("//tb:Participants/tb:participant", namespaces=namespaces)
    return [
        {
            'dialogue_id': dialogue_id,
            'speaker_id': participant.get('id'),
            'speaker_name': participant.get('name'),
            'role': participant.get('role'),
            'age': participant.get('age'),
            'sex': participant.get('sex'),
        }
        for participant in participants
    ]


def _utterance_rows(tree, dialogue_id):
    """Return one dict per tb:u element in the parsed document."""
    rows = []
    for utt in tree.xpath("//tb:u", namespaces=namespaces):
        # Text nodes sitting directly inside the utterance's tb:w elements
        words = utt.xpath(".//tb:w/text()", namespaces=namespaces)
        # Morpheme count: number of tb:mor elements directly under those tb:w elements
        morphemes = utt.xpath(".//tb:w/tb:mor", namespaces=namespaces)
        rows.append({
            'dialogue_id': dialogue_id,
            'uID': utt.get('uID'),
            'speaker': utt.get('who'),
            'utterance': ' '.join(words),
            'utterance_length': len(morphemes),
        })
    return rows


# Root folder of the corpus. XML files are read from it and from every folder
# nested below it, so one path covers all corpus sub-folders.
# NOTE: to avoid typing the path on every run, this call can be replaced by a
# fixed path string.
dir_childes_corpus = get_input_directory()

# Rows gathered across all files; each frame is built once after the scan
speaker_rows = []
utterance_rows = []

for file_path in _iter_xml_paths(dir_childes_corpus):
    # Parse XML data
    tree = etree.parse(file_path)
    dialogue_id = _dialogue_id(file_path)
    speaker_rows.extend(_speaker_rows(tree, dialogue_id))
    utterance_rows.extend(_utterance_rows(tree, dialogue_id))

# Passing explicit columns yields well-formed empty frames when no XML file
# was found, instead of failing on an empty concatenation.
speakers_data = pd.DataFrame(speaker_rows, columns=SPEAKER_COLUMNS)
utterance_data = pd.DataFrame(utterance_rows, columns=UTTERANCE_COLUMNS)