Crawling a website from the CLI works

ZekeriyaAY · Nov 6, 2022 · d4090cc · d4090cc
1 parent c003805
commit d4090cc
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 19 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
 ### PyCharm ###
 .idea/
 .idea*
+__pycache__/
+*.pyc
 
 ### Visual Studio Code ###
 .vscode/*

diff --git a/main.py b/main.py
@@ -1,29 +1,32 @@
-from time import sleep
 import requests
 from bs4 import BeautifulSoup
+from utils import control_target, manage_target, is_404
 
 pages = []
+TARGET_URL = "http://localhost:5500"
 
-
-def get_links(directory, url):
+def get_links(url):
     global pages
-    try:
-        html = requests.get(f'{url}{directory}').text
-        soup = BeautifulSoup(html, "html.parser")
-        for link in soup.find_all("a"):
-            if "href" in link.attrs:
-                if link.attrs["href"] not in pages:
-                    new_page = link.attrs["href"]
-                    pages.append(new_page)
-                    get_links(new_page, url)
-    except:
-        sleep(1)
-    
+
+    html = requests.get(f'{url}').text
+
+    soup = BeautifulSoup(html, "html.parser")
+    for link in soup.find_all("a"):
+        if "href" in link.attrs:
+            potential_page = manage_target(TARGET_URL, link.attrs["href"])
+            if (potential_page not in pages) and (control_target(TARGET_URL, potential_page)) and (not is_404(potential_page)):
+                print(url, " ~~> ", potential_page)
+                pages.append(potential_page)
+                get_links(potential_page)
+
 
 def main():
-    get_links("", "http://localhost:5500/")
-    print("\n".join(pages))
+    get_links(TARGET_URL)
+    #print("\n".join(pages))
 
 
 if __name__ == "__main__":
-    main()
+    try:
+        main()
+    except KeyboardInterrupt:
+        print('bye!!')
diff --git a/test/index.xml b/test/index.xml
@@ -490,7 +490,7 @@ app.delete() // delete request, to delete data
 <description>&lt;p&gt;In today&amp;rsquo;s world, getting distracted is super easy. Also, for a big chunk of young people, doing something is worthless. And I want to disagree with that. We&amp;rsquo;re human because we can do whatever we want. We can build softwares, buildings, and new technologies. To move humanity forward, we have to do.&lt;/p&gt;
 &lt;p&gt;For myself, I&amp;rsquo;m currently a student (btw, I don&amp;rsquo;t like university education 😕), and because of the pandemic, we have online lessons. I don&amp;rsquo;t have to go to school every day, and I don&amp;rsquo;t have to go by bus with all the people on it. For me, things are going well, and I can code more, learn more and create more.&lt;/p&gt;
 &lt;p&gt;Ok, but how to create more?&lt;/p&gt;
-&lt;p&gt;Well, as I understood, you have to force yourself to create something every day. Currently, I&amp;rsquo;m writing on my &lt;a href=&#34;http://localhost:5500/i-decided-to-write-everyday/&#34;&gt;blog&lt;/a&gt; every morning, and I feel better. I started to post more on my social media accounts, and they&amp;rsquo;re not food or travel photos. I&amp;rsquo;m trying to showcase my work and create value for other people. As a result, I started to get more followers (you can check out the &lt;a href=&#34;http://localhost:5500/contact/&#34;&gt;contact&lt;/a&gt; page to follow me 😉).&lt;/p&gt;
+&lt;p&gt;Well, as I understood, you have to force yourself to create something every day. Currently, I&amp;rsquo;m writing on my &lt;a href=&#34;http://localhost:5500/blog/i-decided-to-write-everyday/&#34;&gt;blog&lt;/a&gt; every morning, and I feel better. I started to post more on my social media accounts, and they&amp;rsquo;re not food or travel photos. I&amp;rsquo;m trying to showcase my work and create value for other people. As a result, I started to get more followers (you can check out the &lt;a href=&#34;http://localhost:5500/contact/&#34;&gt;contact&lt;/a&gt; page to follow me 😉).&lt;/p&gt;
 &lt;p&gt;Try to find a place with productive people for me; I&amp;rsquo;m traveling around &lt;a href=&#34;https://www.indiehackers.com/&#34;&gt;IndieHackers&lt;/a&gt;, &lt;a href=&#34;https://www.producthunt.com/&#34;&gt;ProductHunt&lt;/a&gt;, Twitter, and Instagram. For Twitter and Instagram, you have to limit yourself to specific people because a lot of people are just doing nothing, and your time is valuable, so be careful.&lt;/p&gt;
 &lt;p&gt;Also, don&amp;rsquo;t care too much about what other people tell you. You know yourself better than anybody. Try new things to get more creative and be consistent.&lt;/p&gt;
 </description>

diff --git a/utils.py b/utils.py
@@ -0,0 +1,17 @@
+import requests
+
+
+def control_target(TARGET_URL, test_url):  # True only for TARGET_URL itself or a URL below it; the "/" boundary rejects look-alikes such as ":55000"
+    return test_url == TARGET_URL or test_url.startswith(TARGET_URL.rstrip("/") + "/")
+
+def manage_target(TARGET_URL, target):  # turn a root-relative href ("/x") into an absolute URL under TARGET_URL; anything else is returned unchanged
+    if target.startswith("/") and not target.startswith("//"):  # "//host/x" is protocol-relative (another host), not a path on this site
+        target = TARGET_URL.rstrip("/") + target  # strip the base's trailing slash so the join never yields "//"
+    return target
+
+def is_404(url):  # True when a GET request for url comes back with status 404
+    response = requests.get(f'{url}')
+    return response.status_code == 404
+
+def robotsChecker():
+    """Stub that does nothing and returns None; presumably reserved for a robots.txt check (inferred from the name only)."""