diff --git a/.gitignore b/.gitignore
index 970e920..48ef227 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 ### PyCharm ###
 .idea/
 .idea*
+__pycache__/
+*.pyc
 
 ### Visual Studio Code ###
 .vscode/*
diff --git a/main.py b/main.py
index 99f2e28..87a121d 100644
--- a/main.py
+++ b/main.py
@@ -1,29 +1,68 @@
-from time import sleep
 import requests
 from bs4 import BeautifulSoup
+from utils import control_target, manage_target
 
 pages = []
+TARGET_URL = "http://localhost:5500"
+REQUEST_TIMEOUT = 10  # seconds; a stalled response must not hang the crawl
 
 
-def get_links(directory, url):
-    global pages
-    try:
-        html = requests.get(f'{url}{directory}').text
-        soup = BeautifulSoup(html, "html.parser")
-        for link in soup.find_all("a"):
-            if "href" in link.attrs:
-                if link.attrs["href"] not in pages:
-                    new_page = link.attrs["href"]
-                    pages.append(new_page)
-                    get_links(new_page, url)
-    except:
-        sleep(1)
+def fetch_links(url):
+    """Return the href of every <a> on ``url``, or None if it can't be crawled.
+
+    One GET doubles as the liveness check and the download, so each page is
+    requested once. Network failures and 404s yield None instead of raising.
+    """
+    try:
+        response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    except requests.RequestException as error:
+        print(f"skipping {url}: {error}")
+        return None
+    if response.status_code == 404:
+        return None
+    soup = BeautifulSoup(response.text, "html.parser")
+    return [a.attrs["href"] for a in soup.find_all("a") if "href" in a.attrs]
+
+
+def get_links(url):
+    """Crawl every page under TARGET_URL reachable from ``url``.
+
+    Each newly found page is appended to the module-level ``pages`` list in
+    depth-first discovery order and reported on stdout.
+    """
+    root_links = fetch_links(url)
+    if root_links is None:
+        return
+    requested = set(pages)  # O(1) lookups; also remembers dead links
+    # Explicit stack of (page, unread links) rather than recursion, so a long
+    # chain of pages cannot hit RecursionError.
+    pending = [(url, iter(root_links))]
+    while pending:
+        source, hrefs = pending[-1]
+        for href in hrefs:
+            candidate = manage_target(TARGET_URL, href)
+            if candidate in requested:
+                continue
+            if not control_target(TARGET_URL, candidate):
+                continue
+            requested.add(candidate)
+            child_links = fetch_links(candidate)
+            if child_links is None:
+                continue
+            print(source, " ~~> ", candidate)
+            pages.append(candidate)
+            pending.append((candidate, iter(child_links)))
+            break  # descend into the new page before finishing this one
+        else:
+            pending.pop()
 
 
 def main():
-    get_links("", "http://localhost:5500/")
-    print("\n".join(pages))
+    get_links(TARGET_URL)
 
 
 if __name__ == "__main__":
-    main()
+    try:
+        main()
+    except KeyboardInterrupt:
+        print('bye!!')
diff --git a/test/index.xml b/test/index.xml
index 32f5913..6d0df3f 100644
--- a/test/index.xml
+++ b/test/index.xml
@@ -490,7 +490,7 @@ app.delete() // delete request, to delete data
 <p>In today&rsquo;s world, getting distracted is super easy. Also, for a big chunk of young people, doing something is worthless. And I want to disagree with that. We&rsquo;re human because we can do whatever we want. We can build softwares, buildings, and new technologies. 
To move humanity forward, we have to do.</p> <p>For myself, I&rsquo;m currently a student (btw, I don&rsquo;t like university education 😕), and because of the pandemic, we have online lessons. I don&rsquo;t have to go to school every day, and I don&rsquo;t have to go by bus with all the people on it. For me, things are going well, and I can code more, learn more and create more.</p> <p>Ok, but how to create more?</p> -<p>Well, as I understood, you have to force yourself to create something every day. Currently, I&rsquo;m writing on my <a href="http://localhost:5500/i-decided-to-write-everyday/">blog</a> every morning, and I feel better. I started to post more on my social media accounts, and they&rsquo;re not food or travel photos. I&rsquo;m trying to showcase my work and create value for other people. As a result, I started to get more followers (you can check out the <a href="http://localhost:5500/contact/">contact</a> page to follow me 😉).</p> +<p>Well, as I understood, you have to force yourself to create something every day. Currently, I&rsquo;m writing on my <a href="http://localhost:5500/blog/i-decided-to-write-everyday/">blog</a> every morning, and I feel better. I started to post more on my social media accounts, and they&rsquo;re not food or travel photos. I&rsquo;m trying to showcase my work and create value for other people. As a result, I started to get more followers (you can check out the <a href="http://localhost:5500/contact/">contact</a> page to follow me 😉).</p> <p>Try to find a place with productive people for me; I&rsquo;m traveling around <a href="https://www.indiehackers.com/">IndieHackers</a>, <a href="https://www.producthunt.com/">ProductHunt</a>, Twitter, and Instagram. For Twitter and Instagram, you have to limit yourself to specific people because a lot of people are just doing nothing, and your time is valuable, so be careful.</p> <p>Also, don&rsquo;t care too much about what other people tell you. You know yourself better than anybody. 
Try new things to get more creative and be consistent.</p>
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..7336e41
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,47 @@
+from urllib.parse import urlsplit
+
+import requests
+
+REQUEST_TIMEOUT = 10  # seconds; a stalled response must not hang the caller
+
+
+def control_target(TARGET_URL, test_url):
+    """Return True when ``test_url`` belongs to the site rooted at ``TARGET_URL``.
+
+    Scheme and host:port must match exactly and the path must sit under the
+    target's path. A plain ``str.startswith`` test would also accept
+    look-alike hosts such as ``http://localhost:5500.example.com``.
+    """
+    try:
+        target, candidate = urlsplit(TARGET_URL), urlsplit(test_url)
+    except ValueError:
+        # urlsplit rejects malformed hosts such as "http://[bad".
+        return False
+    if (candidate.scheme, candidate.netloc) != (target.scheme, target.netloc):
+        return False
+    base_path = target.path.rstrip("/")
+    return candidate.path == base_path or candidate.path.startswith(base_path + "/")
+
+
+def manage_target(TARGET_URL, target):
+    """Turn an ``href`` value into the URL that should be crawled.
+
+    Root-relative links (``/about/``) are prefixed with ``TARGET_URL``; any
+    other value is passed through. The ``#fragment`` is dropped because it is
+    never sent to the server, so ``/post/`` and ``/post/#top`` are one page.
+    """
+    # "//host/path" is protocol-relative and names another host, so it must
+    # not be glued onto TARGET_URL.
+    if target.startswith("/") and not target.startswith("//"):
+        target = TARGET_URL.rstrip("/") + target
+    return target.partition("#")[0]
+
+
+def is_404(url):
+    """Return True when a GET of ``url`` answers with HTTP 404."""
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    return response.status_code == 404
+
+
+def robotsChecker():
+    """Placeholder for robots.txt handling; not implemented yet."""