Skip to content

Commit

Permalink
Merge pull request #182 from tukcomCD2024/Dev-backend
Browse files Browse the repository at this point in the history
Dev backend
  • Loading branch information
yeonjy authored Aug 2, 2024
2 parents 2cc72d0 + 72adcc8 commit 08388e3
Show file tree
Hide file tree
Showing 12 changed files with 325 additions and 131 deletions.
2 changes: 2 additions & 0 deletions backend/core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ dependencies {
implementation 'org.springframework.boot:spring-boot-starter-data-redis:2.3.1.RELEASE'
implementation 'org.springframework.boot:spring-boot-starter-webflux'
implementation 'com.fasterxml.jackson.core:jackson-core:2.17.0'
implementation 'org.springframework.boot:spring-boot-starter-batch'

implementation "com.querydsl:querydsl-jpa:${queryDslVersion}:jakarta"
annotationProcessor "com.querydsl:querydsl-apt:${queryDslVersion}:jakarta"
Expand All @@ -62,6 +63,7 @@ dependencies {
testImplementation 'org.springframework.security:spring-security-test'
testImplementation 'org.springframework.restdocs:spring-restdocs-mockmvc'
testImplementation 'io.rest-assured:rest-assured:5.1.1'
testImplementation 'org.springframework.batch:spring-batch-test'
}

tasks.named('bootBuildImage') {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,12 @@

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.data.jpa.repository.config.EnableJpaAuditing;
import org.springframework.scheduling.annotation.EnableScheduling;

// Enables Spring's scheduled-task support (@Scheduled methods in components).
@EnableScheduling
@SpringBootApplication
public class BackendApplication {

    // Standard Spring Boot entry point: bootstraps the application context.
    public static void main(String[] args) {
        SpringApplication.run(BackendApplication.class, args);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package com.rollthedice.backend.batch;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.batch.core.*;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.batch.core.repository.JobExecutionAlreadyRunningException;
import org.springframework.batch.core.repository.JobInstanceAlreadyCompleteException;
import org.springframework.batch.core.repository.JobRepository;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

@Slf4j
@Component
@RequiredArgsConstructor
public class ScrapBatchScheduler {
private final JobLauncher jobLauncher;
private final ScrapJobConfig scrapJobConfig;
private final JobRepository jobRepository;
private final Step crawlingNewsUrlStep;
private final Step crawlingNewsContentStep;

// @Scheduled(cron = "0 0 6,12 * * *", zone = "Asia/Seoul") // DB 요금 문제로 개발 기간동안 주석 처리
public void runJob() {
JobParameters jobParameters = new JobParametersBuilder()
.addLong("time", System.currentTimeMillis())
.toJobParameters();
try {
jobLauncher.run(scrapJobConfig.scrapJob(jobRepository,
crawlingNewsUrlStep, crawlingNewsContentStep), jobParameters);
} catch (JobExecutionAlreadyRunningException | JobInstanceAlreadyCompleteException
| JobParametersInvalidException | org.springframework.batch.core.repository.JobRestartException e) {
log.error(e.getMessage());
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package com.rollthedice.backend.batch;

import com.rollthedice.backend.batch.newsContentStep.PreSummarizedNewsDto;
import com.rollthedice.backend.batch.newsContentStep.UncrawledNewsContentReader;
import com.rollthedice.backend.batch.newsUrlStep.InitNewsDto;
import com.rollthedice.backend.batch.newsUrlStep.NewsUrlReader;
import com.rollthedice.backend.domain.news.contentqueue.ContentProducer;
import com.rollthedice.backend.domain.news.dto.ContentMessageDto;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.Step;
import org.springframework.batch.core.configuration.annotation.JobScope;
import org.springframework.batch.core.configuration.annotation.StepScope;
import org.springframework.batch.core.job.builder.JobBuilder;
import org.springframework.batch.core.launch.support.RunIdIncrementer;
import org.springframework.batch.core.repository.JobRepository;
import org.springframework.batch.core.step.builder.StepBuilder;
import org.springframework.batch.item.ItemProcessor;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.database.JdbcBatchItemWriter;
import org.springframework.batch.item.database.builder.JdbcBatchItemWriterBuilder;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.transaction.PlatformTransactionManager;

import javax.sql.DataSource;

/**
 * Spring Batch configuration for the two-step news scraping job:
 * step 1 crawls article URLs per category and inserts bare rows,
 * step 2 crawls each uncrawled article's content, publishes it for
 * summarization, and updates the row.
 */
@Configuration
@RequiredArgsConstructor
public class ScrapJobConfig {

    // Chunk size for the content step, read from application config.
    @Value("${batch.chunk-size}")
    private int chunkSize;

    private final DataSource dataSource;
    private final NewsRepository newsRepository;
    private final ContentProducer contentProducer;

    /**
     * Job definition: URL crawling first, then content crawling.
     * RunIdIncrementer yields fresh parameters so the job can be relaunched.
     */
    @Bean
    public Job scrapJob(JobRepository jobRepository,
                        Step crawlingNewsUrlStep, Step crawlingNewsContentStep) {
        return new JobBuilder("scrapJob", jobRepository)
                .incrementer(new RunIdIncrementer())
                .start(crawlingNewsUrlStep)
                .next(crawlingNewsContentStep)
                .build();
    }

    /**
     * Step 1: read (category, url) pairs from the portal listing pages and
     * insert them into the news table.
     * NOTE(review): chunk size is hard-coded to 30 here while the content
     * step uses the configured {@code chunkSize} — confirm whether intentional.
     */
    @Bean
    @JobScope
    public Step crawlingNewsUrlStep(JobRepository jobRepository,
                                    PlatformTransactionManager transactionManager) {
        return new StepBuilder("crawlingNewsUrlStep", jobRepository)
                .allowStartIfComplete(true)
                .<InitNewsDto, InitNewsDto>chunk(30, transactionManager)
                .reader(newsUrlReader())
                .writer(newsUrlWriter())
                .build();
    }

    @Bean
    @StepScope
    public ItemReader<InitNewsDto> newsUrlReader() {
        return new NewsUrlReader();
    }

    // Inserts new rows with only url and category; content is filled by step 2.
    @Bean
    @StepScope
    public JdbcBatchItemWriter<InitNewsDto> newsUrlWriter() {
        return new JdbcBatchItemWriterBuilder<InitNewsDto>()
                .dataSource(dataSource)
                .sql("insert into news(url, category) values (:url, :newsCategory)")
                .beanMapped()
                .build();
    }

    /**
     * Step 2: read rows whose content is still null, crawl the article page,
     * send the content off for summarization, and update the row.
     */
    @Bean
    @JobScope
    public Step crawlingNewsContentStep(JobRepository jobRepository,
                                        PlatformTransactionManager transactionManager) {
        return new StepBuilder("crawlingNewsContentStep", jobRepository)
                .allowStartIfComplete(true)
                .<PreSummarizedNewsDto, PreSummarizedNewsDto>chunk(chunkSize, transactionManager)
                .reader(uncrawledNewsContentReader())
                .processor(summarizeContentProcessor())
                .writer(newsContentWriter())
                .build();
    }

    @Bean
    @StepScope
    public ItemReader<PreSummarizedNewsDto> uncrawledNewsContentReader() {
        return new UncrawledNewsContentReader(newsRepository);
    }

    // Side-effecting processor: publishes the crawled content to the
    // summarization queue and passes the item through unchanged.
    @Bean
    @StepScope
    public ItemProcessor<PreSummarizedNewsDto, PreSummarizedNewsDto> summarizeContentProcessor() {
        return dto -> {
            contentProducer.sendMessage(new ContentMessageDto(dto.getId(), dto.getContent()));
            return dto;
        };
    }

    // Writes the crawled title/content/postDate back onto the existing row.
    @Bean
    @StepScope
    public JdbcBatchItemWriter<PreSummarizedNewsDto> newsContentWriter() {
        return new JdbcBatchItemWriterBuilder<PreSummarizedNewsDto>()
                .dataSource(dataSource)
                .sql("update news set title = :title, content = :content, post_date = :postDate" +
                        " where id = :id")
                .beanMapped()
                .build();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.rollthedice.backend.batch.newsContentStep;

import lombok.*;

/**
 * News fields scraped from an article page before summarization;
 * written back to the news table by the batch content writer.
 */
@Builder
@AllArgsConstructor
@NoArgsConstructor
@Getter
public class PreSummarizedNewsDto {
    // Primary key of the news row to update.
    private Long id;
    private String title;
    private String content;
    private String postDate;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package com.rollthedice.backend.batch.newsContentStep;

import com.rollthedice.backend.domain.news.entity.News;
import com.rollthedice.backend.domain.news.repository.NewsRepository;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.batch.item.ItemReader;

import java.io.IOException;
import java.util.Iterator;

/**
 * Reads news rows whose content has not been crawled yet, fetches each
 * article page with Jsoup and maps it to a {@link PreSummarizedNewsDto}
 * (title / content / post date scraped from known portal selectors).
 */
public class UncrawledNewsContentReader implements ItemReader<PreSummarizedNewsDto> {
    private final NewsRepository newsRepository;
    private Iterator<News> uncrawledNewsContents;

    public UncrawledNewsContentReader(NewsRepository newsRepository) {
        this.newsRepository = newsRepository;
        // Snapshot of all rows with null content, taken once when the reader is created.
        uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator();
    }

    /**
     * Returns the next crawled article, or null when no uncrawled news
     * remains (signals end-of-input to the step).
     *
     * @throws IOException if the article page cannot be fetched; this fails
     *                     the step rather than skipping the bad URL
     */
    @Override
    public PreSummarizedNewsDto read() throws IOException {
        if (!hasNextUncrawledNews()) {
            return null;
        }
        News news = uncrawledNewsContents.next();
        Document doc = Jsoup.connect(news.getUrl()).get();
        return getNewsContent(news, doc);
    }

    private boolean hasNextUncrawledNews() {
        // Bug fix: the original re-ran findAllByContentIsNull() when the
        // iterator was exhausted. Rows read earlier in the still-open chunk
        // are not written yet (content still null), so the fresh query
        // returned them again and the same articles were crawled and sent
        // for summarization twice. Exhausting the snapshot is the end.
        return uncrawledNewsContents.hasNext();
    }

    // Builds the DTO from the fetched document; scraped fields may be null
    // when neither selector matches the page layout.
    private PreSummarizedNewsDto getNewsContent(News news, Document doc) {
        return PreSummarizedNewsDto.builder()
                .id(news.getId())
                .title(scrapTitle(doc))
                .content(scrapContent(doc))
                .postDate(scrapPostDate(doc))
                .build();
    }

    // Primary selector is the standard article layout; falls back to the
    // entertainment-section layout; null when neither matches.
    private String scrapTitle(final Document doc) {
        Element titleElement = doc.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2");
        if (titleElement == null) {
            titleElement = doc.selectFirst("#content > div.end_ct > div > h2");
        }
        if (titleElement != null) {
            return titleElement.text();
        }
        return null;
    }

    // Returns the article body with all HTML tags and newlines stripped.
    private String scrapContent(final Document doc) {
        Elements contentElements = doc.select("article#dic_area");
        if (contentElements.isEmpty()) {
            contentElements = doc.select("#articeBody");
        }
        return contentElements.outerHtml().replaceAll("\\<[^>]*>|\\n", "");
    }

    // Prefers the data-date-time attribute of the standard layout's
    // timestamp span; falls back to the alternate layout's text; null if absent.
    private String scrapPostDate(final Document doc) {
        Element dateElement = doc.selectFirst("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span");
        if (dateElement != null) {
            return dateElement.attr("data-date-time");
        } else {
            Element altDateElement = doc.selectFirst("#content > div.end_ct > div > div.article_info > span > em");
            if (altDateElement != null) {
                return altDateElement.text();
            }
        }
        return null;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.rollthedice.backend.batch.newsUrlStep;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

/**
 * Minimal news record captured during URL crawling:
 * the category name plus the article URL, inserted as a bare row
 * to be filled in by the content-crawling step.
 */
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
public class InitNewsDto {
    private String newsCategory;
    private String url;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package com.rollthedice.backend.batch.newsUrlStep;

import com.rollthedice.backend.domain.news.entity.NewsCategory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.batch.item.ItemReader;
import org.springframework.beans.factory.annotation.Value;

import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Spring Batch reader producing one {@link InitNewsDto} (category + article
 * URL) per read. For each {@link NewsCategory} it scrapes the category
 * listing page with Jsoup and queues up to {@code crawling.quantity} article
 * URLs; returns null when every category is exhausted (end of step input).
 */
public class NewsUrlReader implements ItemReader<InitNewsDto> {
    // Maximum number of article URLs collected per category.
    @Value("${crawling.quantity}")
    private int crawlingQuantity;

    private final Iterator<NewsCategory> categories;
    private final Queue<InitNewsDto> initNews = new LinkedList<>();

    public NewsUrlReader() {
        // Idiom fix: List.of(...) replaces Arrays.stream(...).collect(toList()).
        // Each category is visited exactly once, in declaration order.
        categories = List.of(NewsCategory.values()).iterator();
    }

    /**
     * Returns the next scraped URL, refilling the queue from the next
     * category whenever it runs empty. A category page yielding no items
     * simply advances to the next one (hence the while loop).
     */
    @Override
    public InitNewsDto read() throws IOException {
        while (initNews.isEmpty() && categories.hasNext()) {
            NewsCategory category = categories.next();
            initNews.addAll(scrapCategoryNews(category));
        }
        return initNews.poll(); // null once everything is consumed -> step ends
    }

    // Scrapes one category listing page, collecting at most crawlingQuantity URLs.
    private List<InitNewsDto> scrapCategoryNews(NewsCategory category) throws IOException {
        Document doc = Jsoup.connect(category.getCategoryUrl()).get();
        Elements newsList = doc.select(".sa_list").select("li");
        // Math.min replaces the original duplicated conditional call.
        return scrapNewsUrl(Math.min(newsList.size(), crawlingQuantity), newsList, category);
    }

    private List<InitNewsDto> scrapNewsUrl(int quantity, Elements newsList, NewsCategory category) {
        List<InitNewsDto> urls = new ArrayList<>();
        for (int i = 0; i < quantity; i++) {
            Element news = newsList.get(i);
            // requireNonNull: fail fast if the listing page markup changes.
            String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href");
            urls.add(new InitNewsDto(category.getName(), url));
        }
        return urls;
    }
}
Loading

0 comments on commit 08388e3

Please sign in to comment.