mirror of
https://github.com/python/cpython.git
synced 2026-06-04 16:50:51 +00:00
This commit is contained in:
parent
310fe88994
commit
f0daba1652
4 changed files with 19 additions and 63 deletions
|
|
@ -5,7 +5,7 @@
|
|||
typedef struct {
|
||||
PyObject_VAR_HEAD
|
||||
Py_DEPRECATED(3.11) Py_hash_t ob_shash;
|
||||
unsigned char ob_sval[1];
|
||||
char ob_sval[1];
|
||||
|
||||
/* Invariants:
|
||||
* ob_sval contains space for 'ob_size+1' elements.
|
||||
|
|
@ -20,7 +20,7 @@ PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
|
|||
#define _PyBytes_CAST(op) \
|
||||
(assert(PyBytes_Check(op)), _Py_CAST(PyBytesObject*, op))
|
||||
|
||||
static inline unsigned char* PyBytes_AS_STRING(PyObject *op)
|
||||
static inline char* PyBytes_AS_STRING(PyObject *op)
|
||||
{
|
||||
return _PyBytes_CAST(op)->ob_sval;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -646,23 +646,26 @@ def test_group_without_user_agent(self):
|
|||
)
|
||||
class BaseLocalNetworkTestCase:
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
def setUp(self):
|
||||
# clear _opener global variable
|
||||
cls.addClassCleanup(urllib.request.urlcleanup)
|
||||
self.addCleanup(urllib.request.urlcleanup)
|
||||
|
||||
cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
|
||||
cls.addClassCleanup(cls.server.server_close)
|
||||
self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
|
||||
|
||||
t = threading.Thread(
|
||||
self.t = threading.Thread(
|
||||
name='HTTPServer serving',
|
||||
target=cls.server.serve_forever,
|
||||
target=self.server.serve_forever,
|
||||
# Short poll interval to make the test finish quickly.
|
||||
# Time between requests is short enough that we won't wake
|
||||
# up spuriously too many times.
|
||||
kwargs={'poll_interval':0.01})
|
||||
cls.enterClassContext(threading_helper.start_threads([t]))
|
||||
cls.addClassCleanup(cls.server.shutdown)
|
||||
self.t.daemon = True # In case this function raises.
|
||||
self.t.start()
|
||||
|
||||
def tearDown(self):
|
||||
self.server.shutdown()
|
||||
self.t.join()
|
||||
self.server.server_close()
|
||||
|
||||
|
||||
SAMPLE_ROBOTS_TXT = b'''\
|
||||
|
|
@ -684,6 +687,7 @@ def do_GET(self):
|
|||
def log_message(self, format, *args):
|
||||
pass
|
||||
|
||||
@threading_helper.reap_threads
|
||||
def testRead(self):
|
||||
# Test that reading a weird robots.txt doesn't fail.
|
||||
addr = self.server.server_address
|
||||
|
|
@ -705,21 +709,17 @@ def testRead(self):
|
|||
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
|
||||
|
||||
|
||||
class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
|
||||
class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
|
||||
class RobotHandler(BaseHTTPRequestHandler):
|
||||
|
||||
def do_GET(self):
|
||||
self.send_error(self.server.return_code)
|
||||
self.send_error(403, "Forbidden access")
|
||||
|
||||
def log_message(self, format, *args):
|
||||
pass
|
||||
|
||||
def setUp(self):
|
||||
# Make sure that a valid code is set in the test.
|
||||
self.server.return_code = None
|
||||
|
||||
@threading_helper.reap_threads
|
||||
def testPasswordProtectedSite(self):
|
||||
self.server.return_code = 403
|
||||
addr = self.server.server_address
|
||||
url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
|
||||
robots_url = url + "/robots.txt"
|
||||
|
|
@ -727,40 +727,6 @@ def testPasswordProtectedSite(self):
|
|||
parser.set_url(url)
|
||||
parser.read()
|
||||
self.assertFalse(parser.can_fetch("*", robots_url))
|
||||
self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
|
||||
|
||||
def testNotFound(self):
|
||||
self.server.return_code = 404
|
||||
addr = self.server.server_address
|
||||
url = f'http://{socket_helper.HOST}:{addr[1]}'
|
||||
robots_url = url + "/robots.txt"
|
||||
parser = urllib.robotparser.RobotFileParser()
|
||||
parser.set_url(url)
|
||||
parser.read()
|
||||
self.assertTrue(parser.can_fetch("*", robots_url))
|
||||
self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
|
||||
|
||||
def testTeapot(self):
|
||||
self.server.return_code = 418
|
||||
addr = self.server.server_address
|
||||
url = f'http://{socket_helper.HOST}:{addr[1]}'
|
||||
robots_url = url + "/robots.txt"
|
||||
parser = urllib.robotparser.RobotFileParser()
|
||||
parser.set_url(url)
|
||||
parser.read()
|
||||
self.assertTrue(parser.can_fetch("*", robots_url))
|
||||
self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
|
||||
|
||||
def testServiceUnavailable(self):
|
||||
self.server.return_code = 503
|
||||
addr = self.server.server_address
|
||||
url = f'http://{socket_helper.HOST}:{addr[1]}'
|
||||
robots_url = url + "/robots.txt"
|
||||
parser = urllib.robotparser.RobotFileParser()
|
||||
parser.set_url(url)
|
||||
parser.read()
|
||||
self.assertFalse(parser.can_fetch("*", robots_url))
|
||||
self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
|
||||
|
||||
|
||||
@support.requires_working_socket()
|
||||
|
|
|
|||
|
|
@ -65,17 +65,9 @@ def read(self):
|
|||
f = urllib.request.urlopen(self.url)
|
||||
except urllib.error.HTTPError as err:
|
||||
if err.code in (401, 403):
|
||||
# If access to robot.txt has the status Unauthorized/Forbidden,
|
||||
# then most likely this applies to the entire site.
|
||||
self.disallow_all = True
|
||||
elif 400 <= err.code < 500:
|
||||
# RFC 9309, Section 2.3.1.3: the crawler MAY access any
|
||||
# resources on the server.
|
||||
elif err.code >= 400 and err.code < 500:
|
||||
self.allow_all = True
|
||||
elif 500 <= err.code < 600:
|
||||
# RFC 9309, Section 2.3.1.4: the crawler MUST assume
|
||||
# complete disallow.
|
||||
self.disallow_all = True
|
||||
err.close()
|
||||
else:
|
||||
raw = f.read()
|
||||
|
|
|
|||
|
|
@ -1,2 +0,0 @@
|
|||
Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
|
||||
is unreachable due to server or network errors.
|
||||
Loading…
Add table
Add a link
Reference in a new issue