24h/cleanup-1.py
... ...
@@ -0,0 +1,43 @@
1
+import os
2
+import re
3
+
4
def clean_content(text):
    """Return the section text of *text* with markup and blank lines removed.

    Processing steps, in order:

    1. Everything from the first ``"Autor:"`` marker onward is discarded.
    2. HTML tag markup (``<...>``) is removed; the text between an opening
       and a closing tag is kept.
    3. Well-formed character references (``&name;``, ``&#123;``,
       ``&#x1F;``) are removed.  A bare ``&`` in running text is left
       untouched.
    4. Each line is stripped; empty lines are dropped, as is every line that
       precedes the first section header.  From the first header onward all
       non-empty lines are kept.

    Args:
        text: Raw file content.

    Returns:
        The kept lines joined by a blank line (``"\\n\\n"``), or an empty
        string when no section header is present.
    """
    # Keep only the part before the first "Autor:" marker.
    text = text.split("Autor:")[0].strip()

    # The entity alternative only matches real character references.  A
    # looser "&[^;]+;" would treat any ampersand as the start of an entity
    # and delete ordinary text up to the next semicolon.
    cleaned = re.sub(
        r'<[^>]+>|&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);',
        '',
        text,
    )

    important_sections = ('Myśl przewodnia AA', 'Pytanie', 'Medytacja', 'Modlitwa')
    filtered_lines = []

    for raw_line in cleaned.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Output starts at the first header; once something has been kept,
        # every later non-empty line is kept as well.
        if filtered_lines or line in important_sections:
            filtered_lines.append(line)

    return '\n\n'.join(filtered_lines)
21
+
22
def process_file(filepath):
    """Clean the file at *filepath* in place.

    The file is read as UTF-8, passed through ``clean_content`` and then
    overwritten with the result.  A status line is printed either way.

    Args:
        filepath: Path of the text file to rewrite.

    Returns:
        None.  Failures are reported on stdout instead of being raised, so
        one bad file does not abort the caller's loop.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        cleaned_content = clean_content(content)

        # The file is reopened for writing only after the read and the
        # cleaning step have succeeded, so a failure there leaves the
        # original file untouched.
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(cleaned_content)
        print(f"Successfully processed: {filepath}")
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on.
        print(f"Error processing {filepath}: {str(e)}")
34
+
35
def main():
    """Run ``process_file`` on every ``.txt`` file next to this script.

    Only the directory containing this script is scanned; subdirectories
    are not searched.
    """
    folder_path = os.path.dirname(os.path.abspath(__file__))
    # os.listdir returns names in arbitrary order; sorting makes the
    # processing order and the printed log reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            process_file(file_path)
41
+
42
# Run the clean-up only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
... ...
\ No newline at end of file