| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  | import re | 
					
						
							| 
									
										
										
										
											2024-04-11 14:45:58 +00:00
										 |  |  | import sys | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  | from pathlib import Path | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-25 11:55:39 +00:00
										 |  |  | import yaml | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  | from jinja2 import Environment | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-25 11:55:39 +00:00
										 |  |  | rules_src = Path(__file__).with_name("rules.yaml") | 
					
						
							| 
									
										
										
										
											2024-04-11 14:45:58 +00:00
										 |  |  | if not rules_src.exists(): | 
					
						
							|  |  |  |     # This skip is usefull mostly for CI operations when installing only Python deps | 
					
						
							|  |  |  |     print("Skipping rules generation, rule file is missing") | 
					
						
							|  |  |  |     sys.exit() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-25 11:55:39 +00:00
										 |  |  | FUZZY_RULES = yaml.safe_load(rules_src.read_text())["fuzzyRules"] | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | PY2JS_RULE_RX = re.compile(r"\\(\d)", re.ASCII) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Do not escape anything, we want to generate code as-is, it won't be interpreted as | 
					
						
							|  |  |  | # HTML anyway | 
					
						
							|  |  |  | JINJA_ENV = Environment(autoescape=False)  # noqa: S701 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | js_code_template = """// THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | export const fuzzyRules = [ | 
					
						
							|  |  |  | {% for rule in FUZZY_RULES %}  { | 
					
						
							| 
									
										
										
										
											2024-04-12 09:07:09 +00:00
										 |  |  |     match: '{{ rule['match'] }}', | 
					
						
							|  |  |  |     replace: '{{ rule['replace'] }}', | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  |   }, | 
					
						
							|  |  |  | {% endfor %} | 
					
						
							|  |  |  | ]; | 
					
						
							| 
									
										
										
										
											2024-04-12 09:07:09 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-29 20:52:24 +00:00
										 |  |  | js_parent = Path(__file__).joinpath("../../javascript/src").resolve() | 
					
						
							| 
									
										
										
										
											2024-04-11 14:45:58 +00:00
										 |  |  | if not js_parent.exists(): | 
					
						
							|  |  |  |     # This skip is usefull mostly for CI operations when working on the Python part | 
					
						
							|  |  |  |     print("Skipping JS rules generation, target folder is missing") | 
					
						
							|  |  |  | else: | 
					
						
							| 
									
										
										
										
											2024-04-29 20:52:24 +00:00
										 |  |  |     (js_parent / "fuzzyRules.js").write_text( | 
					
						
							| 
									
										
										
										
											2024-04-11 14:45:58 +00:00
										 |  |  |         JINJA_ENV.from_string(js_code_template).render( | 
					
						
							|  |  |  |             FUZZY_RULES=[ | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "match": rule["pattern"].replace("\\", "\\\\"), | 
					
						
							|  |  |  |                     "replace": PY2JS_RULE_RX.sub(r"$\1", rule["replace"]), | 
					
						
							|  |  |  |                 } | 
					
						
							|  |  |  |                 for rule in FUZZY_RULES | 
					
						
							|  |  |  |             ] | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-06-04 07:15:51 +00:00
										 |  |  |     print("JS rules generation completed successfully") | 
					
						
							| 
									
										
										
										
											2024-04-09 06:22:01 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | py_code_template = """# THIS IS AN AUTOMATICALLY GENERATED FILE, DO NOT MODIFY DIRECTLY
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | FUZZY_RULES = [ | 
					
						
							|  |  |  | {% for rule in FUZZY_RULES %}  { | 
					
						
							|  |  |  |     "pattern": r"{{ rule['pattern'] }}", | 
					
						
							|  |  |  |     "replace": r"{{ rule['replace'] }}", | 
					
						
							|  |  |  |   }, | 
					
						
							|  |  |  | {% endfor %} | 
					
						
							|  |  |  | ] | 
					
						
							|  |  |  | """
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-29 20:52:24 +00:00
										 |  |  | py_parent = Path(__file__).joinpath("../../src/warc2zim").resolve() | 
					
						
							| 
									
										
										
										
											2024-04-11 14:45:58 +00:00
										 |  |  | if not py_parent.exists(): | 
					
						
							|  |  |  |     # This skip is usefull mostly for CI operations when working on the JS part | 
					
						
							|  |  |  |     print("Skipping Python rules generation, target folder is missing") | 
					
						
							|  |  |  | else: | 
					
						
							|  |  |  |     (py_parent / "rules.py").absolute().write_text( | 
					
						
							|  |  |  |         JINJA_ENV.from_string(py_code_template).render(FUZZY_RULES=FUZZY_RULES) | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-06-04 07:15:51 +00:00
										 |  |  |     print("Python rules generation completed successfully") |