2022-10-25 10:53:32 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import child_process from "child_process";
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import fs from "fs";
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import yaml from "js-yaml";
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								/**
								 * Runs a browsertrix-crawler crawl in Docker, passing the config as YAML on stdin.
								 *
								 * The config sent to the crawler always has `generateCDX: true`, `depth: 0`
								 * and `collection: name`, overriding any such keys in `config`.
								 * An error thrown by the docker command is logged and not rethrown.
								 *
								 * @param {string} name - Collection name the crawl writes to.
								 * @param {object} config - Crawler options; this object is not modified.
								 * @param {string} [commandExtra=""] - Extra text appended verbatim to the crawl command line.
								 */
								function runCrawl(name, config, commandExtra = "") {
								  // Copy instead of mutating the caller's object; key order (and thus the
								  // dumped YAML) is the same as assigning the three keys in place.
								  const crawlConfig = {
								    ...config,
								    generateCDX: true,
								    depth: 0,
								    collection: name,
								  };
								  const configYaml = yaml.dump(crawlConfig);

								  try {
								    // `input` supplies the child's stdin, which `--config stdin` reads from.
								    child_process.execSync(
								      `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`,
								      { input: configYaml, encoding: "utf8" },
								    );
								  } catch (error) {
								    // Best-effort: log the failure and let the caller's assertions decide.
								    console.log(error);
								  }
								}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								/**
								 * Reports whether the CDXJ index of a crawl collection contains `value`.
								 *
								 * @param {string} coll - Collection name under `test-crawls/collections`.
								 * @param {string} value - Text to search for in the index file.
								 * @returns {boolean} `true` if `value` occurs anywhere in the index file.
								 */
								function doesCDXContain(coll, value) {
								  const indexPath = `test-crawls/collections/${coll}/indexes/index.cdxj`;
								  // No encoding is given, so the file is read as a Buffer and searched directly.
								  const cdxIndex = fs.readFileSync(indexPath);
								  return cdxIndex.includes(value);
								}
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-10-09 09:41:50 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								// Test disabled for Brave -- the URL should always be blocked, but we are seeing inconsistent CI behavior
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								/*
							 | 
						
					
						
							
								
									
										
										
										
											2022-10-25 10:53:32 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								test("test crawl without ad block for specific URL", () => {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  const config = {
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    "url": "https://www.mozilla.org/en-US/firefox/",
							 | 
						
					
						
							
								
									
										
										
										
											2023-10-02 14:30:44 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    "pageExtraDelay": 10
							 | 
						
					
						
							
								
									
										
										
										
											2022-10-25 10:53:32 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  };
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  runCrawl("adblock-no-block", config);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  // without ad blocking, URL with googletagmanager is included
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								  expect(doesCDXContain("adblock-no-block", "www.googletagmanager.com")).toBe(true);
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								});
							 | 
						
					
						
							
								
									
										
										
										
											2023-10-09 09:41:50 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								*/
							 | 
						
					
						
							
								
									
										
										
										
											2022-10-25 10:53:32 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								// Crawls with blockAds enabled and checks that the collection's CDX index
								// does not contain the googletagmanager host.
								test("testcrawl with ad block for specific URL", () => {
								  const collection = "adblock-block";

								  runCrawl(collection, {
								    url: "https://www.mozilla.org/en-US/firefox/",
								    blockAds: true,
								  });

								  const hasTagManager = doesCDXContain(
								    collection,
								    "www.googletagmanager.com",
								  );
								  expect(hasTagManager).toBe(false);
								});
							 |