f325df7dc70ff7cf1e62f32ca44d1982b08a60e6
24h/cleanup-1.py
... | ... | @@ -0,0 +1,43 @@ |
1 | +import os |
|
2 | +import re |
|
3 | + |
|
def clean_content(text):
    """Strip markup from *text* and keep only its titled sections.

    Processing steps:

    1. Everything from the first ``"Autor:"`` marker onward is discarded.
    2. HTML tags are removed (only the tags themselves; the text between
       an opening and a closing tag is kept) together with well-formed
       HTML character references such as ``&nbsp;``, ``&#8211;`` or
       ``&#x2014;``. References are deleted, not decoded.
    3. Lines are stripped, empty lines are dropped, and output starts at
       the first line that is exactly one of the known section headers;
       anything before that header is discarded.

    Args:
        text: Raw text to clean.

    Returns:
        The kept lines joined by a blank line (``"\\n\\n"``). The result is
        an empty string when no section header line is present.
    """
    # Only the part before the first "Autor:" marker is of interest.
    body = text.split("Autor:")[0].strip()

    # The entity alternative matches only real character references.  A
    # looser "&...;" pattern would swallow ordinary prose lying between a
    # stray ampersand and the next semicolon (even across line breaks).
    markup = r'<[^>]+>|&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);'
    cleaned = re.sub(markup, '', body)

    important_sections = ['Myśl przewodnia AA', 'Pytanie', 'Medytacja', 'Modlitwa']
    filtered_lines = []

    for raw_line in cleaned.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Nothing is collected until the first section header shows up;
        # after that every non-empty line is kept.
        if filtered_lines or line in important_sections:
            filtered_lines.append(line)

    return '\n\n'.join(filtered_lines)
|
21 | + |
|
def process_file(filepath):
    """Clean one file in place.

    Reads *filepath* as UTF-8, passes the content through
    ``clean_content`` and writes the result back to the same path,
    replacing the previous content.  A status line is printed either
    way; any exception is reported on stdout and not re-raised, so one
    bad file does not stop the caller.

    Args:
        filepath: Path of the text file to rewrite.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        result = clean_content(raw_text)

        # Opening with 'w' truncates the file before the cleaned text is
        # written back.
        with open(filepath, 'w', encoding='utf-8') as target:
            target.write(result)
            print(f"Successfully processed: {filepath}")
    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
|
34 | + |
|
def main():
    """Clean every ``.txt`` file located next to this script.

    Only direct entries of the script's own directory are considered;
    sub-directories are not descended into.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    txt_names = [entry for entry in os.listdir(script_dir) if entry.endswith('.txt')]
    for entry in txt_names:
        process_file(os.path.join(script_dir, entry))


# Run the cleanup only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
... | ... | \ No newline at end of file |